From 96784bcd02f81e371cffcc68b9918c57d45674f4 Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Fri, 12 Apr 2024 14:50:41 +0200 Subject: [PATCH] Add test case for OVNKubernetes alerts Rendered from template version: main (e0fcaa4) --- .cruft.json | 2 +- .github/workflows/test.yaml | 2 + Makefile.vars.mk | 2 +- .../apps/openshift4-monitoring.yaml | 0 .../00_namespace_labels.yaml | 23 + .../openshift4-monitoring/01_secrets.yaml | 0 .../02_aggregated_clusterroles.yaml | 17 + .../10_alertmanager_config.yaml | 39 + .../openshift4-monitoring/10_configmap.yaml | 54 + .../10_configmap_user_workload.yaml | 41 + .../20_networkpolicy.yaml | 62 + .../20_user_workload_networkpolicy.yaml | 62 + .../openshift4-monitoring/capacity_rules.yaml | 141 + .../prometheus_rules.yaml | 2853 +++++++++++++++++ .../openshift4-monitoring/rbac.yaml | 44 + .../openshift4-monitoring/silence.yaml | 107 + tests/ovn-kubernetes.yml | 15 + 17 files changed, 3462 insertions(+), 2 deletions(-) create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/apps/openshift4-monitoring.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/01_secrets.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/rbac.yaml create mode 100644 tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml create mode 100644 tests/ovn-kubernetes.yml diff --git a/.cruft.json b/.cruft.json index 9349c13d..0617c66d 100644 --- a/.cruft.json +++ b/.cruft.json @@ -7,7 +7,7 @@ "name": "OpenShift4 Monitoring", "slug": "openshift4-monitoring", "parameter_key": "openshift4_monitoring", - "test_cases": "capacity-alerts remote-write user-workload-monitoring capacity-alerts-with-node-labels vsphere custom-rules release-4.13 team-routing release-4.14", + "test_cases": "capacity-alerts remote-write user-workload-monitoring capacity-alerts-with-node-labels vsphere custom-rules release-4.13 team-routing release-4.14 ovn-kubernetes", "add_lib": "y", "add_pp": "n", "add_golden": "y", diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c383696c..e8c2834d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,6 +41,7 @@ jobs: - release-4.13 - team-routing - release-4.14 + - ovn-kubernetes defaults: run: working-directory: ${{ env.COMPONENT_NAME }} @@ -64,6 +65,7 @@ jobs: - release-4.13 - team-routing - release-4.14 + - ovn-kubernetes defaults: run: working-directory: ${{ env.COMPONENT_NAME }} diff --git a/Makefile.vars.mk b/Makefile.vars.mk index 8e317d85..b0342cc0 100644 --- a/Makefile.vars.mk +++ b/Makefile.vars.mk @@ -57,4 +57,4 @@ KUBENT_IMAGE ?= ghcr.io/doitintl/kube-no-trouble:latest KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE) instance ?= capacity-alerts -test_instances = tests/capacity-alerts.yml tests/remote-write.yml tests/user-workload-monitoring.yml tests/capacity-alerts-with-node-labels.yml tests/vsphere.yml tests/custom-rules.yml tests/release-4.13.yml tests/team-routing.yml tests/release-4.14.yml +test_instances = tests/capacity-alerts.yml tests/remote-write.yml tests/user-workload-monitoring.yml tests/capacity-alerts-with-node-labels.yml tests/vsphere.yml tests/custom-rules.yml tests/release-4.13.yml tests/team-routing.yml tests/release-4.14.yml tests/ovn-kubernetes.yml diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/apps/openshift4-monitoring.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/apps/openshift4-monitoring.yaml new file mode 100644 index 00000000..e69de29b diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml new file mode 100644 index 00000000..4bc92396 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/00_namespace_labels.yaml @@ -0,0 +1,23 @@ +apiVersion: redhatcop.redhat.io/v1alpha1 +kind: Patch +metadata: + annotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + labels: + name: namespace-openshift-monitoring-c4273dc15ddfdf7 + name: namespace-openshift-monitoring-c4273dc15ddfdf7 + namespace: syn-patch-operator +spec: + patches: + namespace-openshift-monitoring-c4273dc15ddfdf7-patch: + patchTemplate: |- + "metadata": + "labels": + "network.openshift.io/policy-group": "monitoring" + patchType: application/strategic-merge-patch+json + targetObjectRef: + apiVersion: v1 + kind: Namespace + name: openshift-monitoring + serviceAccountRef: + name: patch-sa diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/01_secrets.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/01_secrets.yaml new file mode 100644 index 00000000..e69de29b diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml new file mode 100644 index 00000000..97a8cf95 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/02_aggregated_clusterroles.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + annotations: {} + labels: + name: syn-openshift4-monitoring-cluster-reader + rbac.authorization.k8s.io/aggregate-to-cluster-reader: 'true' + name: syn-openshift4-monitoring-cluster-reader +rules: + - apiGroups: + - monitoring.coreos.com + resources: + - '*' + verbs: + - get + - list + - watch diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml new file mode 100644 index 00000000..5035dc3d --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_alertmanager_config.yaml @@ -0,0 +1,39 @@ +apiVersion: v1 +data: {} +kind: Secret +metadata: + annotations: {} + labels: + name: alertmanager-main + name: alertmanager-main + namespace: openshift-monitoring +stringData: + alertmanager.yaml: |- + "inhibit_rules": + - "equal": + - "namespace" + - "alertname" + "source_match": + "severity": "critical" + "target_match_re": + "severity": "warning|info" + - "equal": + - "namespace" + - "alertname" + "source_match": + "severity": "warning" + "target_match_re": + "severity": "info" + "receivers": + - "name": "__component_openshift4_monitoring_null" + "route": + "group_interval": "5s" + "group_wait": "0s" + "repeat_interval": "10m" + "routes": + - "continue": false + "matchers": + - "namespace =~ \"\"" + "receiver": "__component_openshift4_monitoring_null" + - "receiver": "__component_openshift4_monitoring_null" +type: Opaque diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml new file mode 100644 index 00000000..4588f8c0 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap.yaml @@ -0,0 +1,54 @@ +apiVersion: v1 +data: + config.yaml: |- + "alertmanagerMain": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "2Gi" + "enableUserWorkload": true + "grafana": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "k8sPrometheusAdapter": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "kubeStateMetrics": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "openshiftStateMetrics": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "prometheusK8s": + "externalLabels": + "cluster_id": "c-green-test-1234" + "cluster_name": "Test Cluster 1234" + "tenant_id": "t-silent-test-1234" + "tenant_name": "Test Tenant 1234" + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "retention": "8d" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "50Gi" + "prometheusOperator": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "telemeterClient": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "thanosQuerier": + "nodeSelector": + "node-role.kubernetes.io/infra": "" +kind: ConfigMap +metadata: + annotations: {} + labels: + name: cluster-monitoring-config + name: cluster-monitoring-config + namespace: openshift-monitoring diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml new file mode 100644 index 00000000..08f4fff0 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/10_configmap_user_workload.yaml @@ -0,0 +1,41 @@ +apiVersion: v1 +data: + config.yaml: |- + "alertmanager": + "enableAlertmanagerConfig": true + "enabled": true + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "2Gi" + "prometheus": + "externalLabels": + "cluster_id": "c-green-test-1234-user-workload" + "cluster_name": "Test Cluster 1234 User Workload" + "tenant_id": "t-silent-test-1234" + "tenant_name": "Test Tenant 1234" + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "remoteWrite": [] + "retention": "8d" + "volumeClaimTemplate": + "spec": + "resources": + "requests": + "storage": "50Gi" + "prometheusOperator": + "nodeSelector": + "node-role.kubernetes.io/infra": "" + "thanosRuler": + "nodeSelector": + "node-role.kubernetes.io/infra": "" +kind: ConfigMap +metadata: + annotations: {} + labels: + name: user-workload-monitoring-config + name: user-workload-monitoring-config + namespace: openshift-user-workload-monitoring diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml new file mode 100644 index 00000000..b04b9f76 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml @@ -0,0 +1,62 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: alertmanager-allow-web + name: alertmanager-allow-web + namespace: openshift-monitoring +spec: + ingress: + - ports: + - port: 9092 + protocol: TCP + - port: 9093 + protocol: TCP + - port: 9095 + protocol: TCP + - port: 9097 + protocol: TCP + - from: + - namespaceSelector: {} + podSelector: + matchLabels: + app.kubernetes.io/name: alertmanager + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-same-namespace + name: allow-same-namespace + namespace: openshift-monitoring +spec: + ingress: + - from: + - podSelector: {} + podSelector: {} + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-non-alertmanager + name: allow-non-alertmanager + namespace: openshift-monitoring +spec: + ingress: + - {} + podSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: NotIn + values: + - alertmanager + policyTypes: + - Ingress diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml new file mode 100644 index 00000000..683bc044 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml @@ -0,0 +1,62 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: alertmanager-allow-web + name: alertmanager-allow-web + namespace: openshift-user-workload-monitoring +spec: + ingress: + - ports: + - port: 9092 + protocol: TCP + - port: 9093 + protocol: TCP + - port: 9095 + protocol: TCP + - port: 9097 + protocol: TCP + - from: + - namespaceSelector: {} + podSelector: + matchLabels: + app.kubernetes.io/name: alertmanager + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-same-namespace + name: allow-same-namespace + namespace: openshift-user-workload-monitoring +spec: + ingress: + - from: + - podSelector: {} + podSelector: {} + policyTypes: + - Ingress +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + annotations: {} + labels: + name: allow-non-alertmanager + name: allow-non-alertmanager + namespace: openshift-user-workload-monitoring +spec: + ingress: + - {} + podSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: NotIn + values: + - alertmanager + policyTypes: + - Ingress diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml new file mode 100644 index 00000000..a430c4b2 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/capacity_rules.yaml @@ -0,0 +1,141 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: + syn_component: openshift4-monitoring + labels: + name: capacity + name: capacity + namespace: openshift-monitoring +spec: + groups: + - name: syn-CpuCapacity + rules: + - alert: SYN_ClusterCpuUsageHigh + annotations: + description: The cluster is close to using up all CPU resources. The cluster + might not be able to handle node failures or load spikes. Consider adding + new nodes. + message: Only {{ $value }} idle cpu cores accross cluster. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/cpucapacity.html#SYN_ClusterCpuUsageHigh + syn_component: openshift4-monitoring + expr: sum(label_replace(rate(node_cpu_seconds_total{mode="idle"}[15m]), + "node", "$1", "instance", "(.+)") * on(node) group_left kube_node_role{role="app"}) + < 1.000000 * max((kube_node_status_capacity{resource="cpu"}) * on(node) + group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-MemoryCapacity + rules: + - alert: SYN_ClusterLowOnMemory + annotations: + description: The cluster is close to using all of its memory. The cluster + might not be able to handle node failures or load spikes. Consider adding + new nodes. + message: Only {{ $value }} free memory on Worker Nodes. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/memorycapacity.html#SYN_ClusterMemoryUsageHigh + syn_component: openshift4-monitoring + expr: sum(label_replace(node_memory_MemAvailable_bytes, "node", "$1", "instance", + "(.+)") * on(node) group_left kube_node_role{role="app"}) < 1.000000 * + max((kube_node_status_capacity{resource="memory"}) * on(node) group_left + kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-PodCapacity + rules: + - alert: SYN_TooManyPods + annotations: + description: The cluster is close to the limit of running pods. The cluster + might not be able to handle node failures and might not be able to start + new pods. Consider adding new nodes. + message: Only {{ $value }} more pods can be started. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/podcapacity.html#SYN_TooManyPods + syn_component: openshift4-monitoring + expr: sum(kube_node_status_capacity{resource="pods"} * on(node) group_left + kube_node_role{role="app"}) - sum(kubelet_running_pods * on(node) group_left + kube_node_role{role="app"}) < 1.000000 * max((kube_node_status_capacity{resource="pods"}) + * on(node) group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-ResourceRequests + rules: + - alert: SYN_TooMuchCPURequested + annotations: + description: The cluster is close to assigning all CPU resources to running + pods. The cluster might not be able to handle node failures and might + soon not be able to start new pods. Consider adding new nodes. + message: Only {{ $value }} cpu cores left for new pods. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/resourcerequests.html#SYN_TooMuchCPURequested + syn_component: openshift4-monitoring + expr: sum(kube_node_status_allocatable{resource="cpu"} * on(node) group_left + kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="cpu"} + * on(node) group_left kube_node_role{role="app"}) < 1.000000 * max((kube_node_status_allocatable{resource="cpu"}) + * on(node) group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_TooMuchMemoryRequested + annotations: + description: The cluster is close to assigning all memory to running pods. + The cluster might not be able to handle node failures and might not + be able to start new pods. Consider adding new nodes. + message: Only {{ $value }} memory left for new pods. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/resourcerequests.html#SYN_TooMuchMemoryRequested + syn_component: openshift4-monitoring + expr: sum(kube_node_status_allocatable{resource="memory"} * on(node) group_left + kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="memory"} + * on(node) group_left kube_node_role{role="app"}) < 1.000000 * max((kube_node_status_allocatable{resource="memory"}) + * on(node) group_left kube_node_role{role="app"}) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-UnusedCapacity + rules: + - alert: SYN_ClusterHasUnusedNodes + annotations: + description: The cluster has {{ $value }} unused nodes. Consider removing + unused nodes. + message: Cluster has unused nodes. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/unusedcapacity.html#SYN_ClusterHasUnusedNodes + syn_component: openshift4-monitoring + expr: |- + min(( + label_replace( + (sum(kube_node_status_capacity{resource="pods"} * on(node) group_left kube_node_role{role="app"}) - sum(kubelet_running_pods * on(node) group_left kube_node_role{role="app"})) / max((kube_node_status_capacity{resource="pods"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "pods", "", "") + ) or ( + label_replace( + (sum(kube_node_status_allocatable{resource="memory"} * on(node) group_left kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="memory"} * on(node) group_left kube_node_role{role="app"})) / max((kube_node_status_allocatable{resource="memory"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "requested_memory", "", "") + ) or ( + label_replace( + (sum(kube_node_status_allocatable{resource="cpu"} * on(node) group_left kube_node_role{role="app"}) - sum(kube_pod_resource_request{resource="cpu"} * on(node) group_left kube_node_role{role="app"})) / max((kube_node_status_allocatable{resource="cpu"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "requested_cpu", "", "") + ) or ( + label_replace( + sum(label_replace(node_memory_MemAvailable_bytes, "node", "$1", "instance", "(.+)") * on(node) group_left kube_node_role{role="app"}) / max((kube_node_status_capacity{resource="memory"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "memory", "", "") + ) or ( + label_replace( + sum(label_replace(rate(node_cpu_seconds_total{mode="idle"}[15m]), "node", "$1", "instance", "(.+)") * on(node) group_left kube_node_role{role="app"}) / max((kube_node_status_capacity{resource="cpu"}) * on(node) group_left kube_node_role{role="app"}) + , "resource", "cpu", "", "") + ) + ) > 4.000000 + for: 8h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml new file mode 100644 index 00000000..c8a58cd6 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -0,0 +1,2853 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + role: alert-rules + name: syn-k8s-rules + namespace: openshift-monitoring +spec: + groups: + - name: syn-alertmanager.rules + rules: + - alert: SYN_AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances + within the {{$labels.job}} cluster have been up for less than half of + the last 5m.' + summary: Half or more of the Alertmanager instances within the same cluster + are down. + syn_component: openshift4-monitoring + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job=~"alertmanager-main|alertmanager-user-workload"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration + }} sent from any instance in the {{$labels.job}} cluster is {{ $value + | humanizePercentage }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerClusterFailedToSendAlerts.md + summary: All Alertmanager instances in a cluster failed to send notifications + to a critical integration. + syn_component: openshift4-monitoring + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster + have different configurations. + summary: Alertmanager instances within the same cluster have different + configurations. + syn_component: openshift4-monitoring + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job=~"alertmanager-main|alertmanager-user-workload"}) + ) + != 1 + for: 20m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace + }}/{{ $labels.pod}}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedReload.md + summary: Reloading an Alertmanager configuration has failed. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) == 0 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed + to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration + }}. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedToSendAlerts.md + summary: An Alertmanager instance failed to send notifications. + syn_component: openshift4-monitoring + expr: | + ( + rate(alertmanager_notifications_failed_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has + only found {{ $value }} members of the {{$labels.job}} cluster. + summary: A member of an Alertmanager cluster has not found all other cluster + members. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job=~"alertmanager-main|alertmanager-user-workload"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job=~"alertmanager-main|alertmanager-user-workload"}[5m])) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-machine-approver.rules + rules: + - alert: SYN_MachineApproverMaxPendingCSRsReached + annotations: + description: | + The number of pending CertificateSigningRequests has exceeded the + maximum threshold (current number of machine + 100). Check the + pending CSRs to determine which machines need approval, also check + that the nodelink controller is running in the openshift-machine-api + namespace. + summary: max pending CSRs threshold reached. + syn_component: openshift4-monitoring + expr: | + mapi_current_pending_csr > mapi_max_pending_csr + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-network-operator-master.rules + rules: + - alert: SYN_NoOvnMasterLeader + annotations: + description: | + Networking control plane is degraded. Networking configuration updates applied to the cluster will not be + implemented while there is no OVN Kubernetes leader. Existing workloads should continue to have connectivity. + OVN-Kubernetes control plane is not functional. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoOvnMasterLeader.md + summary: There is no ovn-kubernetes master leader. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max by (namespace) (max_over_time(ovnkube_master_leader[5m])) == 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NoRunningOvnMaster + annotations: + description: | + Networking control plane is degraded. Networking configuration updates applied to the cluster will not be + implemented while there are no OVN Kubernetes pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NoRunningOvnMaster.md + summary: There is no running ovn-kubernetes master. + syn_component: openshift4-monitoring + expr: | + absent(up{job="ovnkube-master", namespace="openshift-ovn-kubernetes"} == 1) + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NorthboundStale + annotations: + description: | + Networking control plane is degraded. Networking configuration updates applied to the cluster will not be + implemented. Existing workloads should continue to have connectivity. OVN-Kubernetes control plane and/or + OVN northbound database may not be functional. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NorthboundStaleAlert.md + summary: ovn-kubernetes has not written anything to the northbound database + for too long. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + time() - max_over_time(ovnkube_master_nb_e2e_timestamp[5m]) > 120 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseClusterIDError + annotations: + description: More than one OVN northbound database cluster ID indicates + degraded OVN database high availability and possible database split + brain. + summary: Multiple OVN northbound database cluster IDs exist. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(count(min_over_time(ovn_db_cluster_id{db_name="OVN_Northbound"}[5m])) by (cluster_id, namespace)) by (namespace) > 1 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseClusterMemberError + annotations: + description: OVN northbound database server(s) has not been a RAFT cluster + member for a period of time which may indicate degraded OVN database + high availability cluster. + summary: OVN northbound database server(s) has not been a member of the + databases high availability for a period of time. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + min_over_time(cluster:ovn_db_nbdb_not_cluster_member:abs[5m]) != 0 + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseInboundConnectionError + annotations: + description: OVN northbound database server(s) is experiencing inbound + RAFT connectivity errors which may indicate degraded OVN database high + availability. + summary: OVN northbound database server(s) is experiencing inbound RAFT + connectivity errors. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + # ..error_total is set to zero when error resolves itself + min_over_time(ovn_db_cluster_inbound_connections_error_total{db_name="OVN_Northbound"}[5m]) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseInboundConnectionMissing + annotations: + description: OVN northbound database server(s) do not have expected number + of inbound connections for a RAFT cluster which may indicate degraded + OVN database high availability. + summary: OVN northbound database server(s) do not have expected number + of inbound RAFT connections. + syn_component: openshift4-monitoring + expr: | + # Expected sum of inbound connections is number of control plane nodes * number of control plane nodes minus one + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + min_over_time(cluster:ovn_db_nbdb_missing_inbound_connections:abs[5m]) != 0 + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseLeaderError + annotations: + description: OVN northbound database(s) have no RAFT leader. Networking + control plane is degraded. + summary: OVN northbound database(s) have no RAFT leader + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(max_over_time(ovn_db_cluster_server_role{db_name="OVN_Northbound", server_role="leader"}[5m])) by (namespace) == 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseMultipleLeadersError + annotations: + description: OVN northbound database(s) have multiple RAFT leaders which + may indicate degraded OVN database high availability. + summary: OVN northbound database(s) have multiple RAFT leaders + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(min_over_time(ovn_db_cluster_server_role{db_name="OVN_Northbound", server_role="leader"}[1m])) by (leader, namespace) > 1 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseOutboundConnectionError + annotations: + description: OVN northbound database server(s) outbound RAFT connectivity + errors may indicate degraded OVN database high availability. + summary: OVN northbound database server(s) is experiencing outbound RAFT + connectivity errors. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + # ..error_total is set to zero when error resolves itself + min_over_time(ovn_db_cluster_outbound_connections_error_total{db_name="OVN_Northbound"}[5m]) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseOutboundConnectionMissing + annotations: + description: OVN northbound database server(s) do not have expected number + of outbound connections for a RAFT cluster which may indicate degraded + OVN database high availability. + summary: OVN northbound database server(s) do not have expected number + of outbound RAFT connections. + syn_component: openshift4-monitoring + expr: | + # Expected sum of outbound connections is number of control plane nodes * number of control plane nodes minus one + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + min_over_time(cluster:ovn_db_nbdb_missing_outbound_connections:abs[5m]) != 0 + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthboundDatabaseTermLag + annotations: + description: OVN northbound database(s) RAFT term have not been equal + which may indicate degraded OVN database high availability. + summary: OVN northbound databases RAFT term have not been equal for a + period of time. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max(max_over_time(ovn_db_cluster_term{db_name="OVN_Northbound"}[5m])) by (namespace) - min(max_over_time(ovn_db_cluster_term{db_name="OVN_Northbound"}[5m])) by (namespace) > 0 + for: 25m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNorthdInactive + annotations: + description: Exactly one OVN northd must have an active status within + the high availability set. Networking control plane is degraded. + summary: Exactly one OVN northd instance must have an active status. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(ovn_northd_status == 1) by (namespace) != 1 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseClusterIDError + annotations: + description: More than one OVN southbound database cluster ID indicates + degraded OVN database high availability and possible database split + brain. + summary: Multiple OVN southbound database cluster IDs exist. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(count(min_over_time(ovn_db_cluster_id{db_name="OVN_Southbound"}[5m])) by (cluster_id, namespace)) by (namespace) > 1 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseClusterMemberError + annotations: + description: OVN southbound database server(s) has not been a RAFT cluster + member for a period of time which may indicate degraded OVN database + high availability. + summary: OVN southbound database server(s) has not been a member of the + databases high availability for a period of time. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + min_over_time(cluster:ovn_db_sbdb_not_cluster_member:abs[5m]) != 0 + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseInboundConnectionError + annotations: + description: OVN southbound database server(s) is experiencing inbound + RAFT connectivity errors which may indicate degraded OVN database high + availability. + summary: OVN southbound database server(s) is experiencing inbound RAFT + connectivity errors. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + # ..error_total is set to zero when error resolves itself + min_over_time(ovn_db_cluster_inbound_connections_error_total{db_name="OVN_Southbound"}[5m]) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseInboundConnectionMissing + annotations: + description: OVN southbound database server(s) do not have expected number + of inbound connections for a RAFT cluster which may indicate degraded + OVN database high availability. + summary: OVN southbound database server(s) do not have expected number + of inbound RAFT connections. + syn_component: openshift4-monitoring + expr: | + # Expected sum of inbound connections is number of control plane nodes * number of control plane nodes minus one + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + min_over_time(cluster:ovn_db_sbdb_missing_inbound_connections:abs[5m]) != 0 + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseLeaderError + annotations: + description: OVN southbound database(s) have no leader. Networking control + plane is degraded. + summary: OVN southbound database(s) have no RAFT leader + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(max_over_time(ovn_db_cluster_server_role{db_name="OVN_Southbound", server_role="leader"}[5m])) by (namespace) == 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseMultipleLeadersError + annotations: + description: OVN southbound database(s) have multiple RAFT leaders which + may indicate degraded OVN database high availability. + summary: OVN southbound database(s) have multiple RAFT leaders + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + count(min_over_time(ovn_db_cluster_server_role{db_name="OVN_Southbound", server_role="leader"}[1m])) by (leader, namespace) > 1 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseOutboundConnectionError + annotations: + description: OVN southbound database server(s) outbound RAFT connectivity + errors which may indicate degraded OVN database high availability. + summary: OVN southbound database server(s) is experiencing outbound RAFT + connectivity errors. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + # ..error_total is set to zero when error resolves itself + min_over_time(ovn_db_cluster_outbound_connections_error_total{db_name="OVN_Southbound"}[5m]) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseOutboundConnectionMissing + annotations: + description: OVN southbound database server(s) do not have expected number + of outbound connections for a RAFT cluster which may indicate degraded + OVN database high availability. + summary: OVN southbound database server(s) do not have expected number + of outbound RAFT connections. + syn_component: openshift4-monitoring + expr: | + # Expected sum of outbound connections is number of control plane nodes * number of control plane nodes minus one + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + min_over_time(cluster:ovn_db_sbdb_missing_outbound_connections:abs[5m]) != 0 + for: 5m + labels: + namespace: openshift-ovn-kubernetes + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesSouthboundDatabaseTermLag + annotations: + description: OVN southbound database(s) RAFT term have not been equal + which may indicate degraded OVN database high availability. + summary: OVN southbound databases RAFT term have not been equal for a + period of time. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max(max_over_time(ovn_db_cluster_term{db_name="OVN_Southbound"}[5m])) by (namespace) - min(max_over_time(ovn_db_cluster_term{db_name="OVN_Southbound"}[5m])) by (namespace) > 0 + for: 25m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_SouthboundStale + annotations: + description: | + Networking control plane is degraded. Networking configuration updates may not be applied to the cluster or + taking a long time to apply. This usually means there is a large load on OVN component 'northd' or it is not + functioning. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/SouthboundStaleAlert.md + summary: ovn-northd has not successfully synced any changes to the southbound + DB for too long. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(ovnkube_master_nb_e2e_timestamp[5m]) - max_over_time(ovnkube_master_sb_e2e_timestamp[5m]) > 120 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_V4SubnetAllocationThresholdExceeded + annotations: + description: More than 80% of IPv4 subnets are used. Insufficient IPv4 + subnets could degrade provisioning of workloads. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/V4SubnetAllocationThresholdExceeded.md + summary: More than 80% of v4 subnets available to assign to the nodes + are allocated. Current v4 subnet allocation percentage is {{ $value + | humanizePercentage }}. + syn_component: openshift4-monitoring + expr: ovnkube_clustermanager_allocated_v4_host_subnets / ovnkube_clustermanager_num_v4_host_subnets + > 0.8 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_V6SubnetAllocationThresholdExceeded + annotations: + description: More than 80% of IPv6 subnets are used. Insufficient IPv6 + subnets could degrade provisioning of workloads. + summary: More than 80% of the v6 subnets available to assign to the nodes + are allocated. Current v6 subnet allocation percentage is {{ $value + | humanizePercentage }}. + syn_component: openshift4-monitoring + expr: ovnkube_clustermanager_allocated_v6_host_subnets / ovnkube_clustermanager_num_v6_host_subnets + > 0.8 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-network-operator-ovn.rules + rules: + - alert: SYN_NodeWithoutOVNKubeNodePodRunning + annotations: + description: | + Networking is degraded on nodes that do not have a functioning ovnkube-node pod. Existing workloads on the + node may continue to have connectivity but any changes to the networking control plane will not be implemented. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-network-operator/NodeWithoutOVNKubeNodePodRunning.md + summary: All Linux nodes should be running an ovnkube-node pod, {{ $labels.node + }} is not. + syn_component: openshift4-monitoring + expr: | + (kube_node_info unless on(node) (kube_pod_info{namespace="openshift-ovn-kubernetes",pod=~"ovnkube-node.*"} + or kube_node_labels{label_kubernetes_io_os="windows"})) > 0 + for: 20m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesControllerDisconnectedSouthboundDatabase + annotations: + description: | + Networking is degraded on nodes when OVN controller is not connected to OVN southbound database connection. No networking control plane updates will be applied to the node. + summary: Networking control plane is degraded on node {{ $labels.node + }} because OVN controller is not connected to OVN southbound database. + syn_component: openshift4-monitoring + expr: | + max_over_time(ovn_controller_southbound_database_connected[5m]) == 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNodeOVSOverflowKernel + annotations: + description: Netlink messages dropped by OVS kernel module due to netlink + socket buffer overflow. This will result in packet loss. + summary: OVS kernel module drops packets due to buffer overflow. + syn_component: openshift4-monitoring + expr: increase(ovs_vswitchd_dp_flows_lookup_lost[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNodeOVSOverflowUserspace + annotations: + description: Netlink messages dropped by OVS vSwitch daemon due to netlink + socket buffer overflow. This will result in packet loss. + summary: OVS vSwitch daemon drops packets due to buffer overflow. + syn_component: openshift4-monitoring + expr: increase(ovs_vswitchd_netlink_overflow[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNodePodAddError + annotations: + description: OVN Kubernetes experiences pod creation errors at an elevated + rate. The pods will be retried. + summary: OVN Kubernetes is experiencing pod creation errors at an elevated + rate. + syn_component: openshift4-monitoring + expr: | + (sum by(instance, namespace) (rate(ovnkube_node_cni_request_duration_seconds_count{command="ADD",err="true"}[5m])) + / + sum by(instance, namespace) (rate(ovnkube_node_cni_request_duration_seconds_count{command="ADD"}[5m]))) + > 0.1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesNodePodDeleteError + annotations: + description: OVN Kubernetes experiences pod deletion errors at an elevated + rate. The pods will be retried. + summary: OVN Kubernetes experiencing pod deletion errors at an elevated + rate. + syn_component: openshift4-monitoring + expr: | + (sum by(instance, namespace) (rate(ovnkube_node_cni_request_duration_seconds_count{command="DEL",err="true"}[5m])) + / + sum by(instance, namespace) (rate(ovnkube_node_cni_request_duration_seconds_count{command="DEL"}[5m]))) + > 0.1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_OVNKubernetesResourceRetryFailure + annotations: + description: | + OVN Kubernetes failed to apply networking control plane configuration after several attempts. This might be because the configuration + provided by the user is invalid or because of an internal error. As a consequence, the cluster might have a degraded status. + summary: OVN Kubernetes failed to apply networking control plane configuration. + syn_component: openshift4-monitoring + expr: increase(ovnkube_resource_retry_failures_total[10m]) > 0 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-operators + rules: + - alert: SYN_ClusterOperatorDegraded + annotations: + description: The {{ $labels.name }} operator is degraded because {{ $labels.reason + }}, and the components it manages may have reduced quality of service. Cluster + upgrades may not complete. For more information refer to 'oc get -o + yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" + | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or + {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ + end }}. + summary: Cluster operator has been degraded for 30 minutes. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name, reason) + ( + ( + cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} + or on (namespace, name) + group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"}) + ) == 1 + ) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterOperatorDown + annotations: + description: The {{ $labels.name }} operator may be down or disabled because + ${{ $labels.reason }}, and the components it manages may be unavailable + or degraded. Cluster upgrades may not complete. For more information + refer to 'oc get -o yaml clusteroperator {{ $labels.name }}'{{ with + $console_url := "console_url" | query }}{{ if ne (len (label "url" (first + $console_url ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ + end }}{{ end }}. + summary: Cluster operator has not been available for 10 minutes. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name, reason) (cluster_operator_up{job="cluster-version-operator"} == 0) + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterOperatorFlapping + annotations: + description: The {{ $labels.name }} operator behavior might cause upgrades + to be unstable. For more information refer to 'oc get -o yaml clusteroperator + {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ + if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label "url" + (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. + summary: Cluster operator up status is changing often. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name) (changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-cluster-version + rules: + - alert: SYN_ClusterReleaseNotAccepted + annotations: + description: The desired cluster release has not been accepted because + ${{ $labels.reason }}, and the cluster will continue to reconcile an + earlier release instead of moving towards that desired release. For + more information refer to 'oc adm upgrade'{{ with $console_url := "console_url" + | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or + {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ + end }}. + summary: The desired cluster release has not been accepted for at least + an hour. + syn_component: openshift4-monitoring + expr: | + max by (namespace, name, reason) (cluster_operator_conditions{name="version", condition="ReleaseAccepted", endpoint="metrics"} == 0) + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ClusterVersionOperatorDown + annotations: + description: The operator may be down or disabled. The cluster will not + be kept up to date and upgrades will not be possible. Inspect the openshift-cluster-version + namespace for events or changes to the cluster-version-operator deployment + or pods to diagnose and repair. {{ with $console_url := "console_url" + | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} For + more information refer to {{ label "url" (first $console_url ) }}/k8s/cluster/projects/openshift-cluster-version.{{ + end }}{{ end }} + summary: Cluster version operator has disappeared from Prometheus target + discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="cluster-version-operator"} == 1) + for: 10m + labels: + namespace: openshift-cluster-version + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target + discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/KubeControllerManagerDown.md + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + namespace: openshift-kube-controller-manager + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="scheduler"} == 1) + for: 15m + labels: + namespace: openshift-kube-scheduler + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PodDisruptionBudgetLimit + annotations: + description: The pod disruption budget is below the minimum disruptions + allowed level and is not satisfied. The number of current healthy pods + is less than the desired healthy pods. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-controller-manager-operator/PodDisruptionBudgetLimit.md + summary: The pod disruption budget registers insufficient amount of pods. + syn_component: openshift4-monitoring + expr: | + max by (namespace, poddisruptionbudget) (kube_poddisruptionbudget_status_current_healthy < kube_poddisruptionbudget_status_desired_healthy) + for: 15m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-control-plane-cpu-utilization + rules: + - alert: SYN_ExtremelyHighIndividualControlPlaneCPU + annotations: + description: Extreme CPU pressure can cause slow serialization and poor + performance from the kube-apiserver and etcd. When this happens, there + is a risk of clients seeing non-responsive API requests which are issued + again causing even more CPU pressure. It can also cause failing liveness + probes due to slow etcd responsiveness on the backend. If one kube-apiserver + fails under this condition, chances are you will experience a cascade + as the remaining kube-apiservers are also under-provisioned. To fix + this, increase the CPU and memory on your control plane nodes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md + summary: Sustained high CPU utilization on a single control plane node, + more CPU pressure is likely to cause a failover; increase available + CPU. + syn_component: openshift4-monitoring + expr: | + 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 90 AND on (instance) label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + for: 1h + labels: + namespace: openshift-kube-apiserver + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_HighOverallControlPlaneCPU + annotations: + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. + syn_component: openshift4-monitoring + expr: | + sum( + 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) + AND on (instance) label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) + / + count(kube_node_role{role="master"}) + > 60 + for: 10m + labels: + namespace: openshift-kube-apiserver + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-etcd + rules: + - alert: SYN_etcdDatabaseQuotaLowSpace + annotations: + description: 'etcd cluster "{{ $labels.job }}": database size exceeds + the defined quota on etcd instance {{ $labels.instance }}, please defrag + or increase the quota as the writes to etcd will be disabled when it + is full.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdDatabaseQuotaLowSpace.md + summary: etcd cluster database is running full. + syn_component: openshift4-monitoring + expr: | + (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync + durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighFsyncDurations.md + summary: etcd cluster 99th percentile fsync durations are too high. + syn_component: openshift4-monitoring + expr: | + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 1 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighNumberOfFailedProposals + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal + failures within the last 30 minutes on etcd instance {{ $labels.instance + }}.' + summary: etcd cluster has high number of proposal failures. + syn_component: openshift4-monitoring + expr: | + rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdMembersDown + annotations: + description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value + }}).' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdMembersDown.md + summary: etcd cluster members are down. + syn_component: openshift4-monitoring + expr: | + max without (endpoint) ( + sum without (instance) (up{job=~".*etcd.*"} == bool 0) + or + count without (To) ( + sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 + ) + ) + > 0 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdNoLeader + annotations: + description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance + }} has no leader.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdNoLeader.md + summary: etcd cluster has no leader. + syn_component: openshift4-monitoring + expr: | + etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-extremely-high-individual-control-plane-memory + rules: + - alert: SYN_ExtremelyHighIndividualControlPlaneMemory + annotations: + description: The memory utilization per instance within control plane + nodes influence the stability, and responsiveness of the cluster. This + can lead to cluster instability and slow responses from kube-apiserver + or failing requests specially on etcd. Moreover, OOM kill is expected + which negatively influences the pod scheduling. If this happens on container + level, the descheduler will not be able to detect it, as it works on + the pod level. To fix this, increase memory of the affected node of + control plane nodes. + summary: Extreme memory utilization per node within control plane nodes + is extremely high, and could impact responsiveness and stability. + syn_component: openshift4-monitoring + expr: | + ( + 1 + - + sum by (instance) ( + node_memory_MemFree_bytes + + node_memory_Buffers_bytes + + node_memory_Cached_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) / sum by (instance) ( + node_memory_MemTotal_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) + ) * 100 > 90 + for: 45m + labels: + namespace: openshift-machine-config-operator + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-general.rules + rules: + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + summary: An alert that should always be firing to certify that Alertmanager + is working properly. + syn_component: openshift4-monitoring + expr: vector(1) + labels: + namespace: openshift-monitoring + severity: none + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-high-overall-control-plane-memory + rules: + - alert: SYN_HighOverallControlPlaneMemory + annotations: + description: | + The overall memory usage is high. + kube-apiserver and etcd might be slow to respond. + To fix this, increase memory of the control plane nodes. + + This alert was adjusted to be less sensitive in 4.11. + Newer Go versions use more memory, if available, to reduce GC pauses. + + Old memory behavior can be restored by setting `GOGC=63`. + See https://bugzilla.redhat.com/show_bug.cgi?id=2074031 for more details. + summary: Memory utilization across all control plane nodes is high, and + could impact responsiveness and stability. + syn_component: openshift4-monitoring + expr: | + ( + 1 + - + sum ( + node_memory_MemFree_bytes + + node_memory_Buffers_bytes + + node_memory_Cached_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) / sum ( + node_memory_MemTotal_bytes + AND on (instance) + label_replace( kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" ) + ) + ) * 100 > 80 + for: 1h + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kube-state-metrics + rules: + - alert: SYN_KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated + rate in watch operations. This is likely causing it to not be able to + expose metrics about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in watch operations. + syn_component: openshift4-monitoring + expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) + > 0.01 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-apps + rules: + - alert: SYN_KubeContainerWaiting + annotations: + description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} + on container {{ $labels.container}} has been in waiting state for longer + than 1 hour. + summary: Pod container waiting longer than 1 hour + syn_component: openshift4-monitoring + expr: | + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ + $labels.daemonset }} are running where they are not supposed to run.' + summary: DaemonSet pods are misscheduled. + syn_component: openshift4-monitoring + expr: | + kube_daemonset_status_number_misscheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ + $labels.daemonset }} are not scheduled.' + summary: DaemonSet pods are not scheduled. + syn_component: openshift4-monitoring + expr: | + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} + has not finished or progressed for at least 30 minutes. + summary: DaemonSet rollout is stuck. + syn_component: openshift4-monitoring + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but + has not been rolled back. + summary: Deployment generation mismatch due to possible roll-back + syn_component: openshift4-monitoring + expr: | + kube_deployment_status_observed_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_deployment_metadata_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDeploymentRolloutStuck + annotations: + description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment + }} is not progressing for longer than 15 minutes. + summary: Deployment rollout is not progressing. + syn_component: openshift4-monitoring + expr: | + kube_deployment_status_condition{condition="Progressing", status="false",namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed + to complete. Removing failed job after investigation should clear this + alert. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeJobFailed.md + summary: Job failed to complete. + syn_component: openshift4-monitoring + expr: | + kube_job_failed{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeJobNotCompleted + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking + more than {{ "43200" | humanizeDuration }} to complete. + summary: Job did not complete in time + syn_component: openshift4-monitoring + expr: | + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + and + kube_job_status_active{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0) > 43200 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePodCrashLooping + annotations: + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is in waiting state (reason: "CrashLoopBackOff").' + summary: Pod is crash looping. + syn_component: openshift4-monitoring + expr: | + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) >= 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in + a non-ready state for longer than 15 minutes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md + summary: Pod has been in a non-ready state for more than 15 minutes. + syn_component: openshift4-monitoring + expr: | + sum by (namespace, pod, cluster) ( + max by(namespace, pod, cluster) ( + kube_pod_status_phase{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", job="kube-state-metrics", phase=~"Pending|Unknown"} + unless ignoring(phase) (kube_pod_status_unschedulable{job="kube-state-metrics"} == 1) + ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( + 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but + has not been rolled back. + summary: StatefulSet generation mismatch due to possible roll-back + syn_component: openshift4-monitoring + expr: | + kube_statefulset_status_observed_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset + }} has not matched the expected number of replicas for longer than 15 + minutes. + summary: StatefulSet has not matched the expected number of replicas. + syn_component: openshift4-monitoring + expr: | + ( + kube_statefulset_status_replicas_ready{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_statefulset_status_replicas{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset + }} update has not been rolled out. + summary: StatefulSet update has not been rolled out. + syn_component: openshift4-monitoring + expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-storage + rules: + - alert: SYN_KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} {{ with + $labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase + }}. + summary: PersistentVolume is having issues with provisioning. + syn_component: openshift4-monitoring + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} + on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage + }} free. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md + summary: PersistentVolume is filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_available_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by + {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace + }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected + to fill up within four days. Currently {{ $value | humanizePercentage + }} is available. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md + summary: PersistentVolume is filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_available_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeInodesFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} + on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage + }} free inodes. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeInodesFillingUp.md + summary: PersistentVolumeInodes are filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_inodes_free{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePersistentVolumeInodesFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by + {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace + }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected + to run out of inodes within four days. Currently {{ $value | humanizePercentage + }} of its inodes are free. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeInodesFillingUp.md + summary: PersistentVolumeInodes are filling up. + syn_component: openshift4-monitoring + expr: | + ( + kubelet_volume_stats_inodes_free{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)", access_mode="ReadOnlyMany"} == 1 + unless on(cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",label_alerts_k8s_io_kube_persistent_volume_filling_up="disabled"} == 1 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-system + rules: + - alert: SYN_KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ $value | humanizePercentage }} errors.' + summary: Kubernetes API server client is experiencing errors. + syn_component: openshift4-monitoring + expr: | + (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) + / + sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-system-apiserver + rules: + - alert: SYN_KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeAPITerminatedRequests + annotations: + description: The kubernetes apiserver has terminated {{ $value | humanizePercentage + }} of its incoming requests. + summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage + }} of its incoming requests. + syn_component: openshift4-monitoring + expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeAggregatedAPIDown + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace + }} has been only {{ $value | humanize }}% available over the last 10m. + summary: Kubernetes aggregated API is down. + syn_component: openshift4-monitoring + expr: | + (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeAggregatedAPIErrors + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace + }} has reported errors. It has appeared unavailable {{ $value | humanize + }} times averaged over the past 10m. + summary: Kubernetes aggregated API has reported errors. + syn_component: openshift4-monitoring + expr: | + sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-kubernetes-system-kubelet + rules: + - alert: SYN_KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeNodeNotReady.md + summary: Node is not ready. + syn_component: openshift4-monitoring + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed + {{ $value }} times in the last 15 minutes. + summary: Node readiness status is flapping. + syn_component: openshift4-monitoring + expr: | + sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may + be rescheduled.' + summary: Node is unreachable. + syn_component: openshift4-monitoring + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its + client certificate ({{ $value | humanize }} errors in the last 5 minutes). + summary: Kubelet has failed to renew its client certificate. + syn_component: openshift4-monitoring + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeletDown.md + summary: Target disappeared from Prometheus target discovery. + syn_component: openshift4-monitoring + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + namespace: kube-system + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile + duration of {{ $value }} seconds on node {{ $labels.node }}. + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + syn_component: openshift4-monitoring + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value + }} seconds on node {{ $labels.node }}. + summary: Kubelet Pod startup latency is too high. + syn_component: openshift4-monitoring + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its + server certificate ({{ $value | humanize }} errors in the last 5 minutes). + summary: Kubelet has failed to renew its server certificate. + syn_component: openshift4-monitoring + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-api-operator-metrics-collector-up + rules: + - alert: SYN_MachineAPIOperatorMetricsCollectionFailing + annotations: + description: 'For more details: oc logs + -n openshift-machine-api' + summary: machine api operator metrics collection is failing. + syn_component: openshift4-monitoring + expr: | + mapi_mao_collector_up == 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-health-check-unterminated-short-circuit + rules: + - alert: SYN_MachineHealthCheckUnterminatedShortCircuit + annotations: + description: | + The number of unhealthy machines has exceeded the `maxUnhealthy` limit for the check, you should check + the status of machines in the cluster. + summary: machine health check {{ $labels.name }} has been disabled by + short circuit for more than 30 minutes + syn_component: openshift4-monitoring + expr: | + mapi_machinehealthcheck_short_circuit == 1 + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-not-yet-deleted + rules: + - alert: SYN_MachineNotYetDeleted + annotations: + description: | + The machine is not properly deleting, this may be due to a configuration issue with the + infrastructure provider, or because workloads on the node have PodDisruptionBudgets or + long termination periods which are preventing deletion. + summary: machine {{ $labels.name }} has been in Deleting phase for more + than 6 hours + syn_component: openshift4-monitoring + expr: | + sum by (name, namespace) (avg_over_time(mapi_machine_created_timestamp_seconds{phase="Deleting"}[15m])) > 0 + for: 360m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-with-no-running-phase + rules: + - alert: SYN_MachineWithNoRunningPhase + annotations: + description: | + The machine has been without a Running or Deleting phase for more than 60 minutes. + The machine may not have been provisioned properly from the infrastructure provider, or + it might have issues with CertificateSigningRequests being approved. + summary: 'machine {{ $labels.name }} is in phase: {{ $labels.phase }}' + syn_component: openshift4-monitoring + expr: | + sum by (name, namespace) (mapi_machine_created_timestamp_seconds{phase!~"Running|Deleting"}) > 0 + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-machine-without-valid-node-ref + rules: + - alert: SYN_MachineWithoutValidNode + annotations: + description: | + If the machine never became a node, you should diagnose the machine related failures. + If the node was deleted from the API, you may delete the machine if appropriate. + summary: machine {{ $labels.name }} does not have valid node reference + syn_component: openshift4-monitoring + expr: | + sum by (name, namespace) (mapi_machine_created_timestamp_seconds unless on(node) kube_node_info) > 0 + for: 60m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcc-drain-error + rules: + - alert: SYN_MCCDrainError + annotations: + message: 'Drain failed on {{ $labels.exported_node }} , updates may be + blocked. For more details check MachineConfigController pod logs: oc + logs -f -n {{ $labels.namespace }} machine-config-controller-xxxxx -c + machine-config-controller' + syn_component: openshift4-monitoring + expr: | + mcc_drain_err > 0 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-kubelet-health-state-error + rules: + - alert: SYN_KubeletHealthState + annotations: + message: Kubelet health failure threshold reached + syn_component: openshift4-monitoring + expr: | + mcd_kubelet_state > 2 + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-pivot-error + rules: + - alert: SYN_MCDPivotError + annotations: + message: 'Error detected in pivot logs on {{ $labels.node }} , upgrade + may be blocked. For more details: oc logs -f -n {{ $labels.namespace + }} {{ $labels.pod }} -c machine-config-daemon ' + syn_component: openshift4-monitoring + expr: | + mcd_pivot_errors_total > 0 + for: 2m + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-mcd-reboot-error + rules: + - alert: SYN_MCDRebootError + annotations: + message: 'Reboot failed on {{ $labels.node }} , update may be blocked. + For more details: oc logs -f -n {{ $labels.namespace }} {{ $labels.pod + }} -c machine-config-daemon ' + syn_component: openshift4-monitoring + expr: | + mcd_reboots_failed_total > 0 + for: 5m + labels: + namespace: openshift-machine-config-operator + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-node-exporter + rules: + - alert: SYN_NodeBondingDegraded + annotations: + description: Bonding interface {{ $labels.master }} on {{ $labels.instance + }} is in degraded state due to one or more slave failures. + summary: Bonding interface is degraded + syn_component: openshift4-monitoring + expr: | + (node_bonding_slaves - node_bonding_active) != 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeClockNotSynchronising + annotations: + description: Clock at {{ $labels.instance }} is not synchronising. Ensure + NTP is configured on this host. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md + summary: Clock not synchronising. + syn_component: openshift4-monitoring + expr: | + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeClockSkewDetected + annotations: + description: Clock at {{ $labels.instance }} is out of sync by more than + 0.05s. Ensure NTP is configured correctly on this host. + summary: Clock skew detected. + syn_component: openshift4-monitoring + expr: | + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeDiskIOSaturation + annotations: + description: | + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. + This symptom might indicate disk saturation. + summary: Disk IO queue is high. + syn_component: openshift4-monitoring + expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) > 10 + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently + at {{ printf "%.2f" $value }}%. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md + summary: Kernel is predicted to exhaust file descriptors limit soon. + syn_component: openshift4-monitoring + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md + summary: Filesystem has less than 3% inodes left. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md + summary: Filesystem has less than 3% space left. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left and is filling up fast. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md + summary: Filesystem is predicted to run out of inodes within the next + 4 hours. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md + summary: Filesystem is predicted to run out of space within the next 24 + hours. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up fast. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md + summary: Filesystem is predicted to run out of space within the next 4 + hours. + syn_component: openshift4-monitoring + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!~"/var/lib/ibmc-s3fs.*"} == 0 + ) + for: 1h + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are + used.' + summary: Number of conntrack are getting close to the limit. + syn_component: openshift4-monitoring + expr: | + (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeMemoryHighUtilization + annotations: + description: | + Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + summary: Host is running out of memory. + syn_component: openshift4-monitoring + expr: | + 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeMemoryMajorPagesFaults + annotations: + description: | + Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Please check that there is enough memory available at this instance. + summary: Memory major page faults are occurring at very high rate. + syn_component: openshift4-monitoring + expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has + encountered {{ printf "%.0f" $value }} receive errors in the last two + minutes.' + summary: Network interface is reporting many receive errors. + syn_component: openshift4-monitoring + expr: | + rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has + encountered {{ printf "%.0f" $value }} transmit errors in the last two + minutes.' + summary: Network interface is reporting many transmit errors. + syn_component: openshift4-monitoring + expr: | + rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_NodeSystemdServiceFailed + annotations: + description: Systemd service {{ $labels.name }} has entered failed state + at {{ $labels.instance }} + summary: Systemd service has entered failed state. + syn_component: openshift4-monitoring + expr: | + node_systemd_unit_state{job="node-exporter", state="failed"} == 1 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-node-network + rules: + - alert: SYN_NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ $labels.device }}" changing its up + status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod + }} + summary: Network interface is often changing its status + syn_component: openshift4-monitoring + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+|tunbr"}[2m]) > 2 + for: 2m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-node-utilization + rules: + - alert: SYN_node_memory_free_percent + annotations: + message: '{{ $labels.node }}: Memory usage more than 97% (current value + is: {{ $value | humanizePercentage }})%' + syn_component: openshift4-monitoring + expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes + > 0.97 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-etcd.rules + rules: + - alert: SYN_etcdGRPCRequestsSlow + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC + requests is {{ $value }}s on etcd instance {{ $labels.instance }} for + {{ $labels.grpc_method }} method.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdGRPCRequestsSlow.md + summary: etcd grpc requests are slow + syn_component: openshift4-monitoring + expr: | + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd", grpc_method!="Defragment", grpc_type="unary"}[10m])) without(grpc_type)) + > 1 + for: 30m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests + for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance + }}.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdHighNumberOfFailedGRPCRequests.md + summary: etcd cluster has high number of failed grpc requests. + syn_component: openshift4-monitoring + expr: | + (sum(rate(grpc_server_handled_total{job="etcd", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + (sum(rate(grpc_server_handled_total{job="etcd"}[5m])) without (grpc_type, grpc_code) + > 2 and on ()(sum(cluster_infrastructure_provider{type!~"ipi|BareMetal"} == bool 1)))) * 100 > 50 + for: 10m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdHighNumberOfLeaderChanges + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} average leader + changes within the last 10 minutes. Frequent elections may be a sign + of insufficient resources, high network latency, or disruptions by other + components and should be investigated.' + summary: etcd cluster has high number of leader changes. + syn_component: openshift4-monitoring + expr: | + avg(changes(etcd_server_is_leader[10m])) > 5 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_etcdInsufficientMembers + annotations: + description: etcd is reporting fewer instances are available than are + needed ({{ $value }}). When etcd does not have a majority of instances + available the Kubernetes and OpenShift APIs will reject read and write + requests and operations that preserve the health of workloads cannot + be performed. This can occur when multiple control plane nodes are powered + off or are unable to connect to each other via the network. Check that + all control plane nodes are powered on and that network connections + between each machine are functional. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-etcd-operator/etcdInsufficientMembers.md + summary: etcd is reporting that a majority of instances are unavailable. + syn_component: openshift4-monitoring + expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} + == bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod) + + 1) / 2) + for: 3m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-general.rules + rules: + - alert: SYN_TargetDown + annotations: + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ + $labels.service }} targets in {{ $labels.namespace }} namespace have + been unreachable for more than 15 minutes. This may be a symptom of + network connectivity issues, down nodes, or failures within these components. + Assess the health of the infrastructure and nodes running these targets + and then contact support.' + summary: Some targets were not reachable from the monitoring server for + an extended period of time. + syn_component: openshift4-monitoring + expr: | + 100 * (( + 1 - sum by (job, namespace, service) (up and on(namespace, pod) kube_pod_info) / + count by (job, namespace, service) (up and on(namespace, pod) kube_pod_info) + ) or ( + count by (job, namespace, service) (up == 0) / + count by (job, namespace, service) (up) + )) > 10 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-ingress-to-route-controller.rules + rules: + - alert: SYN_UnmanagedRoutes + annotations: + description: This alert fires when there is a Route owned by an unmanaged + Ingress. + message: Route {{ $labels.name }} is owned by an unmanaged Ingress. + summary: Route owned by an Ingress no longer managed + syn_component: openshift4-monitoring + expr: openshift_ingress_to_route_controller_route_with_unmanaged_owner == + 1 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-ingress.rules + rules: + - alert: SYN_HAProxyDown + annotations: + description: This alert fires when metrics report that HAProxy is down. + message: HAProxy metrics are reporting that HAProxy is down on pod {{ + $labels.namespace }} / {{ $labels.pod }} + summary: HAProxy is down + syn_component: openshift4-monitoring + expr: haproxy_up == 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_HAProxyReloadFail + annotations: + description: This alert fires when HAProxy fails to reload its configuration, + which will result in the router not picking up recently created or modified + routes. + message: HAProxy reloads are failing on {{ $labels.pod }}. Router is not + respecting recently created or modified routes + summary: HAProxy reload failure + syn_component: openshift4-monitoring + expr: template_router_reload_failure == 1 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_IngressControllerDegraded + annotations: + description: This alert fires when the IngressController status is degraded. + message: | + The {{ $labels.namespace }}/{{ $labels.name }} ingresscontroller is + degraded: {{ $labels.reason }}. + summary: IngressController is degraded + syn_component: openshift4-monitoring + expr: ingress_controller_conditions{condition="Degraded"} == 1 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_IngressControllerUnavailable + annotations: + description: This alert fires when the IngressController is not available. + message: | + The {{ $labels.namespace }}/{{ $labels.name }} ingresscontroller is + unavailable: {{ $labels.reason }}. + summary: IngressController is unavailable + syn_component: openshift4-monitoring + expr: ingress_controller_conditions{condition="Available"} == 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-openshift-kubernetes.rules + rules: + - alert: SYN_ClusterMonitoringOperatorReconciliationErrors + annotations: + description: Errors are occurring during reconciliation cycles. Inspect + the cluster-monitoring-operator log for potential root causes. + summary: Cluster Monitoring Operator is experiencing unexpected reconciliation + errors. + syn_component: openshift4-monitoring + expr: max_over_time(cluster_monitoring_operator_last_reconciliation_successful[5m]) + == 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment + }} has not matched the expected number of replicas for longer than 15 + minutes. This indicates that cluster infrastructure is unable to start + or restart the necessary components. This most often occurs when one + or more nodes are down or partioned from the cluster, or a fault occurs + on the node that prevents the workload from starting. In rare cases + this may indicate a new version of a cluster component cannot start + due to a bug or configuration error. Assess the pods for this deployment + to verify they are running on healthy nodes and then contact support. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeDeploymentReplicasMismatch.md + summary: Deployment has not matched the expected number of replicas + syn_component: openshift4-monitoring + expr: | + ((( + kube_deployment_spec_replicas{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}[5m]) + == + 0 + )) * on() group_left cluster:control_plane:all_nodes_ready) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_KubePodNotScheduled + annotations: + description: |- + Pod {{ $labels.namespace }}/{{ $labels.pod }} cannot be scheduled for more than 30 minutes. + Check the details of the pod with the following command: + oc describe -n {{ $labels.namespace }} pod {{ $labels.pod }} + summary: Pod cannot be scheduled. + syn_component: openshift4-monitoring + expr: last_over_time(kube_pod_status_unschedulable{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)"}[5m]) + == 1 + for: 30m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-prometheus + rules: + - alert: SYN_PrometheusBadConfig + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + to reload its configuration. + summary: Failed Prometheus configuration reload. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) == 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping + {{ printf "%.4g" $value }} samples/s with different values but duplicated + timestamp. + summary: Prometheus is dropping samples with duplicate timestamps. + syn_component: openshift4-monitoring + expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ printf "%.1f" $value }}% errors while sending alerts + from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager + {{$labels.alertmanager}}.' + summary: Prometheus has encountered more than 1% errors sending alerts + to a specific Alertmanager. + syn_component: openshift4-monitoring + expr: | + ( + rate(prometheus_notifications_errors_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + / + rate(prometheus_notifications_sent_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusHighQueryLoad + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API + has less than 20% available capacity in its query engine for the last + 15 minutes. + summary: Prometheus is reaching its maximum capacity serving concurrent + requests. + syn_component: openshift4-monitoring + expr: | + avg_over_time(prometheus_engine_queries{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0.8 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because some samples exceeded the + configured label_limit, label_name_length_limit or label_value_length_limit. + summary: Prometheus has dropped targets because some scrape configs have + exceeded the labels limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed + {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + summary: Prometheus is missing rule evaluations due to slow rule group + evaluation. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_rule_group_iterations_missed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected + to any Alertmanagers. + summary: Prometheus is not connected to any Alertmanagers. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) < 1 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusNotIngestingSamples + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting + samples. + summary: Prometheus is not ingesting samples. + syn_component: openshift4-monitoring + expr: | + ( + sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=~"prometheus-k8s|prometheus-user-workload"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job=~"prometheus-k8s|prometheus-user-workload"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} + is running full. + summary: Prometheus alert notification queue predicted to run full in + less than 30m. + syn_component: openshift4-monitoring + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job=~"prometheus-k8s|prometheus-user-workload"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping + {{ printf "%.4g" $value }} samples/s with timestamps arriving out of + order. + summary: Prometheus drops samples with out-of-order timestamps. + syn_component: openshift4-monitoring + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to + send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ + $labels.url }} + summary: Prometheus fails to send samples to remote storage. + syn_component: openshift4-monitoring + expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) or rate(prometheus_remote_storage_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write + desired shards calculation wants to run {{ $value }} shards for queue + {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max + of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job=~"prometheus-k8s|prometheus-user-workload"}` + $labels.instance | query | first | value }}. + runbook_url: https://hub.syn.tools/openshift4-monitoring/runbooks/remotewrite.html + summary: Prometheus remote write desired shards calculation wants to run + more than configured max shards. + syn_component: openshift4-monitoring + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) + ) + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusRuleFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + to evaluate {{ printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusRuleFailures.md + summary: Prometheus is failing rule evaluations. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_rule_evaluation_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusSDRefreshFailure + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + to refresh SD with mechanism {{$labels.mechanism}}. + summary: Failed Prometheus SD refresh. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_sd_refresh_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[10m]) > 0 + for: 20m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusScrapeBodySizeLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + {{ printf "%.0f" $value }} scrapes in the last 5m because some targets + exceeded the configured body_size_limit. + summary: Prometheus has dropped some targets that exceeded body size limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusScrapeSampleLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed + {{ printf "%.0f" $value }} scrapes in the last 5m because some targets + exceeded the configured sample_limit. + summary: Prometheus has failed scrapes that have exceeded the configured + sample limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected + {{$value | humanize}} compaction failures over the last 3h. + summary: Prometheus has issues compacting blocks. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_tsdb_compactions_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0 + for: 4h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected + {{$value | humanize}} reload failures over the last 3h. + summary: Prometheus has issues reloading blocks from disk. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_tsdb_reloads_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0 + for: 4h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTargetLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because the number of targets exceeded + the configured target_limit. + summary: Prometheus has dropped targets because some scrape configs have + exceeded the targets limit. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusTargetSyncFailure + annotations: + description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} + have failed to sync because invalid configuration was supplied.' + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusTargetSyncFailure.md + summary: Prometheus has failed to sync targets. + syn_component: openshift4-monitoring + expr: | + increase(prometheus_target_sync_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[30m]) > 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-prometheus-operator + rules: + - alert: SYN_PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace + isn't ready to reconcile {{ $labels.controller }} resources. + summary: Prometheus operator not ready + syn_component: openshift4-monitoring + expr: | + min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) == 0) + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorReconcileErrors + annotations: + description: '{{ $value | humanizePercentage }} of reconciling operations + failed for {{ $labels.controller }} controller in {{ $labels.namespace + }} namespace.' + summary: Errors while reconciling objects. + syn_component: openshift4-monitoring + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorRejectedResources + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace + rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource + }} resources. + summary: Resources rejected by Prometheus operator + syn_component: openshift4-monitoring + expr: | + min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorStatusUpdateErrors + annotations: + description: '{{ $value | humanizePercentage }} of status update operations + failed for {{ $labels.controller }} controller in {{ $labels.namespace + }} namespace.' + summary: Errors while updating objects status. + syn_component: openshift4-monitoring + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorSyncFailed + annotations: + description: Controller {{ $labels.controller }} in {{ $labels.namespace + }} namespace fails to reconcile {{ $value }} objects. + summary: Last controller reconciliation failed + syn_component: openshift4-monitoring + expr: | + min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{$labels.controller}} + in {{$labels.namespace}} namespace. + summary: Errors while performing watch operations in controller. + syn_component: openshift4-monitoring + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]))) > 0.4 + for: 15m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-system-memory-exceeds-reservation + rules: + - alert: SYN_SystemMemoryExceedsReservation + annotations: + message: System memory usage of {{ $value | humanize }} on {{ $labels.node + }} exceeds 95% of the reservation. Reserved memory ensures system processes + can function even when the node is fully allocated and protects against + workload out of memory events impacting the proper functioning of the + node. The default reservation is expected to be sufficient for most + configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) + when running nodes with high numbers of pods (either due to rate of + change or at steady state). + syn_component: openshift4-monitoring + expr: | + sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) + for: 15m + labels: + namespace: openshift-machine-config-operator + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-thanos-query + rules: + - alert: SYN_ThanosQueryHttpRequestQueryErrorRateHigh + annotations: + description: Thanos Query {{$labels.job}} in {{$labels.namespace}} is + failing to handle {{$value | humanize}}% of "query" requests. + summary: Thanos Query is failing to handle requests. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query"}[5m])) + / + sum by (namespace, job) (rate(http_requests_total{job="thanos-querier", handler="query"}[5m])) + ) * 100 > 5 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosQueryHttpRequestQueryRangeErrorRateHigh + annotations: + description: Thanos Query {{$labels.job}} in {{$labels.namespace}} is + failing to handle {{$value | humanize}}% of "query_range" requests. + summary: Thanos Query is failing to handle requests. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job) (rate(http_requests_total{code=~"5..", job="thanos-querier", handler="query_range"}[5m])) + / + sum by (namespace, job) (rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m])) + ) * 100 > 5 + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosQueryOverload + annotations: + description: Thanos Query {{$labels.job}} in {{$labels.namespace}} has + been overloaded for more than 15 minutes. This may be a symptom of excessive + simultanous complex requests, low performance of the Prometheus API, + or failures within these components. Assess the health of the Thanos + query instances, the connnected Prometheus instances, look for potential + senders of these requests and then contact support. + summary: Thanos query reaches its maximum capacity serving concurrent + requests. + syn_component: openshift4-monitoring + expr: | + ( + max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1 + ) + for: 1h + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - name: syn-thanos-rule + rules: + - alert: SYN_ThanosNoRuleEvaluations + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + did not perform any rule evaluations in the past 10 minutes. + summary: Thanos Rule did not perform any rule evaluations. + syn_component: openshift4-monitoring + expr: | + sum by (namespace, job, instance) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) <= 0 + and + sum by (namespace, job, instance) (thanos_rule_loaded_rules{job="thanos-ruler"}) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleGrpcErrorRate + annotations: + description: Thanos Rule {{$labels.job}} in {{$labels.namespace}} is failing + to handle {{$value | humanize}}% of requests. + summary: Thanos Rule is failing to handle grpc requests. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job="thanos-ruler"}[5m])) + / + sum by (namespace, job, instance) (rate(grpc_server_started_total{job="thanos-ruler"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleHighRuleEvaluationFailures + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + is failing to evaluate rules. + summary: Thanos Rule is failing to evaluate rules. + syn_component: openshift4-monitoring + expr: | + ( + sum by (namespace, job, instance) (rate(prometheus_rule_evaluation_failures_total{job="thanos-ruler"}[5m])) + / + sum by (namespace, job, instance) (rate(prometheus_rule_evaluations_total{job="thanos-ruler"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleQueueIsDroppingAlerts + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + is failing to queue alerts. + runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ThanosRuleQueueIsDroppingAlerts.md + summary: Thanos Rule is failing to queue alerts. + syn_component: openshift4-monitoring + expr: | + sum by (namespace, job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 + for: 5m + labels: + severity: critical + syn: 'true' + syn_component: openshift4-monitoring + - alert: SYN_ThanosRuleSenderIsFailingAlerts + annotations: + description: Thanos Rule {{$labels.instance}} in {{$labels.namespace}} + is failing to send alerts to alertmanager. + summary: Thanos Rule is failing to send alerts to alertmanager. + syn_component: openshift4-monitoring + expr: | + sum by (namespace, job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) > 0 + for: 5m + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/rbac.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/rbac.yaml new file mode 100644 index 00000000..1c6d4fea --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/rbac.yaml @@ -0,0 +1,44 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + annotations: + syn_component: openshift4-monitoring + labels: + name: syn-prometheus-auto-discovery + name: syn-prometheus-auto-discovery +rules: + - apiGroups: + - '' + resources: + - pods + - services + - endpoints + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + syn_component: openshift4-monitoring + labels: + name: syn-prometheus-auto-discovery + name: syn-prometheus-auto-discovery +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: syn-prometheus-auto-discovery +subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: openshift-monitoring diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml new file mode 100644 index 00000000..c3e45f77 --- /dev/null +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -0,0 +1,107 @@ +apiVersion: v1 +data: + silence: | + #!/bin/bash + set -euo pipefail + + curl_opts=( https://alertmanager-main.openshift-monitoring.svc.cluster.local:9095/api/v2/silences --cacert /etc/ssl/certs/serving-certs/service-ca.crt --header 'Content-Type: application/json' --header "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" --resolve "alertmanager-main.openshift-monitoring.svc.cluster.local:9095:$(getent hosts alertmanager-operated.openshift-monitoring.svc.cluster.local | awk '{print $1}' | head -n 1)" --silent ) + + while IFS= read -r silence; do + comment=$(printf %s "${silence}" | jq -r '.comment') + + body=$(printf %s "$silence" | \ + jq \ + --arg startsAt "$(date -u +'%Y-%m-%dT%H:%M:%S' --date '-1 min')" \ + --arg endsAt "$(date -u +'%Y-%m-%dT%H:%M:%S' --date '+1 day')" \ + --arg createdBy "Kubernetes object \`cronjob/silence\` in the monitoring namespace" \ + '.startsAt = $startsAt | .endsAt = $endsAt | .createdBy = $createdBy' + ) + + id=$(curl "${curl_opts[@]}" | jq -r ".[] | select(.status.state == \"active\") | select(.comment == \"${comment}\") | .id" | head -n 1) + if [ -n "${id}" ]; then + body=$(printf %s "${body}" | jq --arg id "${id}" '.id = $id') + fi + + curl "${curl_opts[@]}" -XPOST -d "${body}" + done <<<"$(printf %s "${SILENCES_JSON}" | jq -cr '.[]')" + silences.json: '[{"comment":"Silence non syn alerts","matchers":[{"isRegex":true,"name":"alertname","value":".+"},{"isRegex":false,"name":"syn","value":""}]}]' +kind: ConfigMap +metadata: + annotations: {} + labels: + name: silence + name: silence + namespace: openshift-monitoring +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + annotations: {} + labels: + name: silence + name: silence + namespace: openshift-monitoring +spec: + concurrencyPolicy: Forbid + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + completions: 1 + parallelism: 1 + template: + metadata: + labels: + name: silence + spec: + containers: + - args: [] + command: + - /usr/local/bin/silence + env: + - name: SILENCES_JSON + valueFrom: + configMapKeyRef: + key: silences.json + name: silence + image: quay.io/appuio/oc:v4.14 + imagePullPolicy: IfNotPresent + name: silence + ports: [] + stdin: false + tty: false + volumeMounts: + - mountPath: /etc/ssl/certs/serving-certs/ + name: ca-bundle + readOnly: true + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access + readOnly: true + - mountPath: /usr/local/bin/silence + name: scripts + readOnly: true + subPath: silence + imagePullSecrets: [] + initContainers: [] + nodeSelector: + node-role.kubernetes.io/infra: '' + restartPolicy: Never + serviceAccountName: prometheus-k8s + terminationGracePeriodSeconds: 30 + volumes: + - configMap: + defaultMode: 288 + name: serving-certs-ca-bundle + name: ca-bundle + - name: kube-api-access + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + defaultMode: 360 + name: silence + name: scripts + schedule: 0 */4 * * * + successfulJobsHistoryLimit: 3 diff --git a/tests/ovn-kubernetes.yml b/tests/ovn-kubernetes.yml new file mode 100644 index 00000000..caf0b0ab --- /dev/null +++ b/tests/ovn-kubernetes.yml @@ -0,0 +1,15 @@ +parameters: + kapitan: + dependencies: + - type: https + source: https://raw.githubusercontent.com/projectsyn/component-patch-operator/master/lib/patch-operator.libsonnet + output_path: vendor/lib/patch-operator.libsonnet + + patch_operator: + namespace: syn-patch-operator + patch_serviceaccount: + name: patch-sa + + openshift4_monitoring: + upstreamRules: + networkPlugin: ovn-kubernetes