Skip to content

Commit

Permalink
Add support for OCP 4.16
Browse files: browse the repository at this point in the history
  • Loading branch information
Debakel Orakel committed Oct 15, 2024
1 parent 4d39bbd commit eb521f6
Show file tree
Hide file tree
Showing 27 changed files with 3,356 additions and 107 deletions.
4 changes: 2 additions & 2 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ parameters:
manifests_version: release-4.15
# operator-lifecycle-manager has no release branches newer than 4.9, so every supported version maps to release-4.9
=_operator_lifecycle_manager_map:
release-4.13: release-4.9
release-4.14: release-4.9
release-4.15: release-4.9
release-4.16: release-4.9
jsonnetfile_parameters:
cmo_version: ${openshift4_monitoring:manifests_version}
etcd_version: ${openshift4_monitoring:manifests_version}
Expand Down Expand Up @@ -205,9 +205,9 @@ parameters:
NodeMemoryMajorPagesFaults:
# Only alert for >100*cores major page faults/node instead of >500/node
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) (count by (instance) (node_cpu_info{}) * 100)
release-4.13: {}
release-4.14: {}
release-4.15: {}
release-4.16: {}
# Alerts to ignore for user workload monitoring
ignoreUserWorkload: []

Expand Down
34 changes: 21 additions & 13 deletions class/openshift4-monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ parameters:
openshift4_monitoring:
=_manifest_urls:
kube-apiserver:
release-4.13:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.14:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/cpu-utilization.yaml
Expand All @@ -14,21 +10,28 @@ parameters:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.16:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.16/bindata/assets/alerts/kube-apiserver-slos-basic.yaml

machine-api-operator:
release-4.13:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.13/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.14:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.14/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.15:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.15/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.16:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.16/install/0000_90_machine-api-operator_04_alertrules.yaml

machine-config-operator:
release-4.14:
prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.14/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
release-4.15:
prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.15/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
release-4.16:
prometheus: https://raw.githubusercontent.com/openshift/machine-config-operator/release-4.16/install/0000_90_machine-config_01_prometheus-rules.yaml

ovn-kubernetes:
release-4.13:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.14:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We handle the gotemplate stuff in Jsonnet for now, since Jinja
Expand All @@ -47,11 +50,16 @@ parameters:
# The only templates that are in the alerting rules can be handled
# with a simple string replace.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.15/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.16:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml

cloud-credential-operator:
release-4.13: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.13/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.14: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.14/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.15: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml
release-4.16: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.16/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml


kapitan:
Expand Down Expand Up @@ -103,7 +111,7 @@ parameters:
source: ${openshift4_monitoring:_manifest_urls:machine-api-operator:${openshift4_monitoring:manifests_version}:prometheus}
output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/machine-api-operator.yaml
- type: https
source: https://raw.githubusercontent.com/openshift/machine-config-operator/${openshift4_monitoring:manifests_version}/install/0000_90_machine-config-operator_01_prometheus-rules.yaml
source: ${openshift4_monitoring:_manifest_urls:machine-config-operator:${openshift4_monitoring:manifests_version}:prometheus}
output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/machine-config-operator.yaml
- type: https
source: https://raw.githubusercontent.com/operator-framework/operator-lifecycle-manager/${openshift4_monitoring:_operator_lifecycle_manager_map:${openshift4_monitoring:manifests_version}}/manifests/0000_90_olm_01-prometheus-rule.yaml
Expand Down
4 changes: 2 additions & 2 deletions tests/custom-rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ parameters:
name: patch-sa

openshift4_monitoring:
manifests_version: release-4.13
manifests_version: release-4.16

customNodeExporter:
enabled: true
Expand All @@ -25,7 +25,7 @@ parameters:
labels:
foo: foo
generic: patch
release-4.13:
release-4.16:
HighOverallControlPlaneMemory:
labels:
foo: bar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ spec:
syn_component: openshift4-monitoring
expr: |
(
max by(namespace, statefulset) (
max by(namespace, statefulset, job, cluster) (
kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
Expand Down Expand Up @@ -1040,7 +1040,7 @@ spec:
}} of its incoming requests.
syn_component: openshift4-monitoring
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@ spec:
syn_component: openshift4-monitoring
expr: |
(
max by(namespace, statefulset) (
max by(namespace, statefulset, job, cluster) (
kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
Expand Down Expand Up @@ -1040,7 +1040,7 @@ spec:
}} of its incoming requests.
syn_component: openshift4-monitoring
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ metadata:
labels:
app.kubernetes.io/part-of: openshift4-monitoring
name: appuio-node-exporter
namespace: openshift-monitoring
rules:
- apiGroups:
- authentication.k8s.io
Expand Down Expand Up @@ -33,7 +32,6 @@ metadata:
labels:
app.kubernetes.io/part-of: openshift4-monitoring
name: appuio-node-exporter
namespace: openshift-monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
Expand All @@ -59,6 +57,7 @@ spec:
metadata:
annotations:
kubectl.kubernetes.io/default-container: appuio-node-exporter
openshift.io/required-scc: node-exporter
labels:
app.kubernetes.io/managed-by: cluster-monitoring-operator
app.kubernetes.io/part-of: openshift4-monitoring
Expand Down Expand Up @@ -152,6 +151,9 @@ spec:
fi
echo "ts=$(date -Iseconds) num_cpus=$NUM_CPUS gomaxprocs=$GOMAXPROCS"
exec /bin/node_exporter "$0" "$@"
env:
- name: DBUS_SYSTEM_BUS_ADDRESS
value: unix:path=/host/root/var/run/dbus/system_bus_socket
image: quay.io/prometheus/node-exporter:v1.8.2
name: appuio-node-exporter
resources:
Expand All @@ -162,7 +164,6 @@ spec:
cpu: 8m
memory: 32Mi
securityContext: {}
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
Expand All @@ -177,7 +178,6 @@ spec:
readOnly: true
workingDir: /var/node_exporter/textfile
- args:
- --logtostderr
- --secure-listen-address=[$(IP)]:9199
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:9199/
Expand Down Expand Up @@ -209,7 +209,8 @@ spec:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
terminationMessagePolicy: FallbackToLogsOnError
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /etc/tls/private
name: node-exporter-tls
Expand Down Expand Up @@ -240,7 +241,6 @@ spec:
securityContext:
privileged: true
runAsUser: 0
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /var/node_exporter/textfile
name: node-exporter-textfile
Expand Down Expand Up @@ -306,6 +306,8 @@ apiVersion: v1
kind: Service
metadata:
annotations:
openshift.io/description: Expose the `/metrics` endpoint on port 9199. This port
is for internal use, and no other usage is guaranteed.
service.beta.openshift.io/serving-cert-secret-name: appuio-node-exporter-tls
labels:
app.kubernetes.io/part-of: openshift4-monitoring
Expand Down
Loading

0 comments on commit eb521f6

Please sign in to comment.