Merge pull request #207 from appuio/ocp-4.15-support
Add support for OCP 4.15
haasad authored Jul 24, 2024
2 parents 24f7fe7 + a572f87 commit 0285f58
Showing 40 changed files with 3,362 additions and 127 deletions.
4 changes: 2 additions & 2 deletions .cruft.json
@@ -1,13 +1,13 @@
{
"template": "https://github.com/projectsyn/commodore-component-template.git",
"commit": "26ee71e475cca036551c68a6c6b2285fe86139a0",
"commit": "2ae1bc3383f211eee5f20a963f5ac74725d85d5b",
"checkout": "main",
"context": {
"cookiecutter": {
"name": "OpenShift4 Monitoring",
"slug": "openshift4-monitoring",
"parameter_key": "openshift4_monitoring",
"test_cases": "capacity-alerts remote-write user-workload-monitoring capacity-alerts-with-node-labels vsphere custom-rules release-4.13 team-routing release-4.14 ovn-kubernetes",
"test_cases": "capacity-alerts remote-write user-workload-monitoring capacity-alerts-with-node-labels vsphere custom-rules release-4.13 team-routing release-4.14 ovn-kubernetes release-4.15",
"add_lib": "y",
"add_pp": "n",
"add_golden": "y",
2 changes: 2 additions & 0 deletions .github/workflows/test.yaml
@@ -42,6 +42,7 @@ jobs:
- team-routing
- release-4.14
- ovn-kubernetes
- release-4.15
defaults:
run:
working-directory: ${{ env.COMPONENT_NAME }}
@@ -66,6 +67,7 @@ jobs:
- team-routing
- release-4.14
- ovn-kubernetes
- release-4.15
defaults:
run:
working-directory: ${{ env.COMPONENT_NAME }}
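The two CI jobs above each gain release-4.15 in their instance list. A minimal sketch of where that entry sits, assuming the standard strategy.matrix layout used by the component-template test workflow; only the release-4.15 entry itself is taken from the diff, the surrounding keys are an assumption:

    strategy:
      matrix:
        instance:
          # existing golden/test instances (abridged)
          - release-4.13
          - release-4.14
          - ovn-kubernetes
          # new instance added for OCP 4.15
          - release-4.15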
2 changes: 1 addition & 1 deletion Makefile.vars.mk
@@ -57,4 +57,4 @@ KUBENT_IMAGE ?= ghcr.io/doitintl/kube-no-trouble:latest
KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE)

instance ?= capacity-alerts
test_instances = tests/capacity-alerts.yml tests/remote-write.yml tests/user-workload-monitoring.yml tests/capacity-alerts-with-node-labels.yml tests/vsphere.yml tests/custom-rules.yml tests/release-4.13.yml tests/team-routing.yml tests/release-4.14.yml tests/ovn-kubernetes.yml
test_instances = tests/capacity-alerts.yml tests/remote-write.yml tests/user-workload-monitoring.yml tests/capacity-alerts-with-node-labels.yml tests/vsphere.yml tests/custom-rules.yml tests/release-4.13.yml tests/team-routing.yml tests/release-4.14.yml tests/ovn-kubernetes.yml tests/release-4.15.yml
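Each file in test_instances is a Commodore test class under tests/. The new tests/release-4.15.yml isn't shown in this excerpt; a plausible minimal shape, by analogy with the other release-4.1x instances (the parameter key and value appear in class/defaults.yml below, the rest is an assumption):

    # tests/release-4.15.yml — hypothetical content, not part of this diff
    parameters:
      openshift4_monitoring:
        # pin the vendored manifests to the OCP 4.15 release branch
        manifests_version: release-4.15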
17 changes: 6 additions & 11 deletions class/defaults.yml
@@ -7,21 +7,15 @@ parameters:
prom.libsonnet: openshift4-monitoring-prom.libsonnet
alert-patching.libsonnet: openshift4-monitoring-alert-patching.libsonnet
namespace: openshift-monitoring
# TODO: select based on reported OCP version once we have dynamic facts
manifests_version: release-4.14
=_cluster_monitoring_operator_version_map:
release-4.13: release-4.13
release-4.14: release-4.14
=_etcd_operator_version_map:
release-4.13: release-4.13
release-4.14: release-4.14
manifests_version: release-4.15
# no release branches newer than 4.9 exist
=_operator_lifecycle_manager_map:
release-4.13: release-4.9
release-4.14: release-4.9
release-4.15: release-4.9
jsonnetfile_parameters:
cmo_version: ${openshift4_monitoring:_cluster_monitoring_operator_version_map:${openshift4_monitoring:manifests_version}}
etcd_version: ${openshift4_monitoring:_etcd_operator_version_map:${openshift4_monitoring:manifests_version}}
cmo_version: ${openshift4_monitoring:manifests_version}
etcd_version: ${openshift4_monitoring:manifests_version}
defaultConfig:
nodeSelector:
node-role.kubernetes.io/infra: ''
@@ -211,6 +205,7 @@ parameters:
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) (count by (instance) (node_cpu_info{}) * 100)
release-4.13: {}
release-4.14: {}
release-4.15: {}
# Alerts to ignore for user workload monitoring
ignoreUserWorkload: []

@@ -237,7 +232,7 @@ parameters:
images:
oc:
image: quay.io/appuio/oc
tag: v4.14
tag: v4.15
node_exporter:
registry: quay.io
repository: prometheus/node-exporter
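With this change the component defaults to the 4.15 manifests, and the separate cluster-monitoring-operator and etcd version maps collapse into the single manifests_version parameter, which is now interpolated directly into jsonnetfile_parameters. A cluster that is still on an older OCP minor can keep the matching manifests by overriding that one key in its hierarchy; a minimal sketch, assuming the usual reclass-style override (the file location is illustrative):

    # hypothetical cluster- or distribution-level class
    parameters:
      openshift4_monitoring:
        # stay on the 4.14 alerting manifests until the cluster is upgraded
        manifests_version: release-4.14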
43 changes: 20 additions & 23 deletions class/openshift4-monitoring.yml
@@ -2,14 +2,6 @@ parameters:
openshift4_monitoring:
=_manifest_urls:
kube-apiserver:
release-4.11:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.12:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.13:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/cpu-utilization.yaml
@@ -18,28 +10,20 @@ parameters:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.15:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/kube-apiserver-slos-basic.yaml

machine-api-operator:
release-4.11:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.11/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.12:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.12/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.13:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.13/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.14:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.14/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.15:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.15/install/0000_90_machine-api-operator_04_alertrules.yaml

ovn-kubernetes:
release-4.11:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.12:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.13:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
@@ -56,11 +40,24 @@ parameters:
# when selecting OVNKubernetes as the network plugin during
# installation.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.14/bindata/network/ovn-kubernetes/self-hosted/multi-zone-interconnect/alert-rules-control-plane.yaml
release-4.15:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We handle the gotemplate stuff in Jsonnet for now, since Jinja
# can't deal with gotemplate expressions like `{{.OvnkubeMasterReplicas}}`.
# The only templates that are in the alerting rules can be handled
# with a simple string replace.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.15/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml

cloud-credential-operator:
release-4.13: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.13/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.14: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.14/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.15: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml


kapitan:
dependencies:
- type: https
source: https://raw.githubusercontent.com/openshift/cloud-credential-operator/${openshift4_monitoring:manifests_version}/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
source: ${openshift4_monitoring:_manifest_urls:cloud-credential-operator:${openshift4_monitoring:manifests_version}}
output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/cloud-credential-operator.yaml
# Download cluster-version-operator rules YAML to folder
# `manifests_requiring_prerendering/`, because we cannot prerender
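The Kapitan dependency for the cloud-credential-operator rules now looks the URL up in the new _manifest_urls map instead of interpolating manifests_version straight into a single URL pattern. The indirection is needed because the upstream file name changed: releases 4.13 and 4.14 ship 0000_90_cloud-credential-operator_04_alertrules.yaml, while 4.15 ships 0000_90_cloud-credential-operator_03_alertrules.yaml. With manifests_version set to release-4.15, the nested reference resolves roughly as follows (a sketch of the interpolation result, not additional configuration):

    # ${openshift4_monitoring:_manifest_urls:cloud-credential-operator:${openshift4_monitoring:manifests_version}}
    #   -> ${openshift4_monitoring:_manifest_urls:cloud-credential-operator:release-4.15}
    #   -> the release-4.15 URL from the map above
    kapitan:
      dependencies:
        - type: https
          source: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml
          output_path: dependencies/openshift4-monitoring/manifests/release-4.15/cloud-credential-operator.yaml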
@@ -221,6 +221,25 @@ spec:
syn_component: openshift4-monitoring
- name: syn-cluster-operators
rules:
- alert: SYN_CannotEvaluateConditionalUpdates
annotations:
description: Failure to evaluate conditional update matches means that
Cluster Version Operator cannot decide whether an update path is recommended
or not.
summary: Cluster Version Operator cannot evaluate conditional update matches
for {{ $value | humanizeDuration }}.
syn_component: openshift4-monitoring
expr: |
max by (version, condition, status, reason)
(
(
time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"}
) >= 3600
)
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_ClusterOperatorDegraded
annotations:
description: The {{ $labels.name }} operator is degraded because {{ $labels.reason
@@ -392,13 +411,17 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_HighOverallControlPlaneCPU
annotations:
description: |-
On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes.
On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
description: Given three control plane nodes, the overall CPU utilization
may only be about 2/3 of all available capacity. This is because if
a single control plane node fails, the remaining two must handle the
load of the cluster in order to be HA. If the cluster is using more
than 2/3 of all capacity, if one control plane node fails, the remaining
two are likely to fail when they take the load. To fix this, increase
the CPU and memory on your control plane nodes.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
summary: CPU utilization across all control plane nodes is more than 60%
of the total available CPU. Control plane node outage may cause a cascading
failure; increase available CPU.
summary: CPU utilization across all three control plane nodes is higher
than two control plane nodes can sustain; a single control plane node
outage may cause a cascading failure; increase available CPU.
syn_component: openshift4-monitoring
expr: |
sum(
@@ -426,7 +449,7 @@ spec:
summary: etcd cluster database is running full.
syn_component: openshift4-monitoring
expr: |
(last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
labels:
severity: critical
@@ -829,7 +852,7 @@ spec:
syn_component: openshift4-monitoring
expr: |
(
max without (revision) (
max by(namespace, statefulset) (
kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
@@ -1352,10 +1375,12 @@ spec:
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md
summary: Clock not synchronising.
syn_component: openshift4-monitoring
expr: |
expr: |-
(
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: critical
@@ -1367,7 +1392,8 @@ spec:
0.05s. Ensure NTP is configured correctly on this host.
summary: Clock skew detected.
syn_component: openshift4-monitoring
expr: |
expr: |-
(
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
@@ -1379,6 +1405,7 @@ spec:
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: warning
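The rendered rules in the diff above pick up a new SYN_CannotEvaluateConditionalUpdates alert from the 4.15 cluster-version-operator, and the node clock alerts gain an `and on() absent(up{job="ptp-monitor-service"})` guard so they stay quiet on clusters that use PTP rather than NTP/chrony as their time source. If the new conditional-update alert isn't wanted, it could be dropped through the component's alert filtering; a sketch, assuming the component exposes an alerts.ignoreNames list as in earlier versions (that parameter doesn't appear in this diff):

    parameters:
      openshift4_monitoring:
        alerts:
          # hypothetical override: drop the new CVO conditional-update alert
          ignoreNames:
            - CannotEvaluateConditionalUpdates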
@@ -63,7 +63,7 @@ spec:
configMapKeyRef:
key: silences.json
name: silence
image: quay.io/appuio/oc:v4.14
image: quay.io/appuio/oc:v4.15
imagePullPolicy: IfNotPresent
name: silence
ports: []