From 9e8082b4e49b7504ad2f3efbfc611ff46c8b50c1 Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Wed, 11 Dec 2024 17:28:32 +0100 Subject: [PATCH] Allow traffic from cluster nodes to all workloads in the monitoring namespace --- component/networkpolicy.libsonnet | 32 ++++++++++++++++++- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- .../20_networkpolicy.yaml | 17 ++++++++++ .../20_user_workload_networkpolicy.yaml | 17 ++++++++++ .../prometheus_rules.yaml | 9 +++--- .../prometheus_rules.yaml | 9 +++--- tests/user-workload-monitoring.yml | 2 ++ 15 files changed, 122 insertions(+), 45 deletions(-) diff --git a/component/networkpolicy.libsonnet b/component/networkpolicy.libsonnet index 43ce83f7..1b86010b 100644 --- a/component/networkpolicy.libsonnet +++ b/component/networkpolicy.libsonnet @@ -5,6 +5,8 @@ local kube = import 'lib/kube.libjsonnet'; local inv = kap.inventory(); local params = inv.parameters.openshift4_monitoring; +local cilium_cluster = std.member(inv.applications, 'cilium'); + [ kube.NetworkPolicy('alertmanager-allow-web') { spec: { @@ -82,4 +84,32 @@ local params = inv.parameters.openshift4_monitoring; }, }, }, -] +] + if cilium_cluster then [ + // allow all traffic from the cluster nodes, so that the HAproxy ingress can + // do healthchecks for routes in the openshift-monitoring namespace. + { + apiVersion: 'cilium.io/v2', + kind: 'CiliumNetworkPolicy', + metadata: { + annotations: { + 'syn.tools/description': ||| + Note that this policy isn't named allow-from-cluster-nodes, even + though its content is identical to ensure that Espejo doesn't delete + the policy. + |||, + }, + name: 'allow-from-cluster-nodes-custom', + }, + spec: { + endpointSelector: {}, + ingress: [ + { + fromEntities: [ + 'host', + 'remote-node', + ], + }, + ], + }, + }, +] else [] diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 1df74991..ee419e5e 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1077,13 +1077,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 1df74991..ee419e5e 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1077,13 +1077,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 64aef6c0..38d4328e 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1086,14 +1086,15 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAggregatedAPIErrors.md summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 6abfa06f..04b636b0 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1228,13 +1228,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 13a95e1e..c8e16f04 100644 --- a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1058,13 +1058,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 1df74991..ee419e5e 100644 --- a/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.15/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1077,13 +1077,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index ef3122d7..9e58f202 100644 --- a/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.16/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1083,14 +1083,15 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAggregatedAPIErrors.md summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 1df74991..ee419e5e 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1077,13 +1077,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 573eb608..9a87a3a2 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1132,13 +1132,14 @@ spec: syn_team: clumsy-donkeys - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml index e7e81d7c..b400430f 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_networkpolicy.yaml @@ -60,3 +60,20 @@ spec: - alertmanager policyTypes: - Ingress +--- +apiVersion: cilium.io/v2 +kind: CiliumNetworkPolicy +metadata: + annotations: + syn.tools/description: | + Note that this policy isn't named allow-from-cluster-nodes, even + though its content is identical to ensure that Espejo doesn't delete + the policy. + name: allow-from-cluster-nodes-custom + namespace: openshift-monitoring +spec: + endpointSelector: {} + ingress: + - fromEntities: + - host + - remote-node diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml index ad572958..5ba5074c 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/20_user_workload_networkpolicy.yaml @@ -60,3 +60,20 @@ spec: - alertmanager policyTypes: - Ingress +--- +apiVersion: cilium.io/v2 +kind: CiliumNetworkPolicy +metadata: + annotations: + syn.tools/description: | + Note that this policy isn't named allow-from-cluster-nodes, even + though its content is identical to ensure that Espejo doesn't delete + the policy. + name: allow-from-cluster-nodes-custom + namespace: openshift-user-workload-monitoring +spec: + endpointSelector: {} + ingress: + - fromEntities: + - host + - remote-node diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index d2790e52..cf47410a 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1077,13 +1077,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 710272cf..7a8b9578 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -1077,13 +1077,14 @@ spec: syn_component: openshift4-monitoring - alert: SYN_KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace - }} has reported errors. It has appeared unavailable {{ $value | humanize - }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name + }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster + }}. summary: Kubernetes aggregated API has reported errors. syn_component: openshift4-monitoring expr: | - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 + for: 10m labels: severity: warning syn: 'true' diff --git a/tests/user-workload-monitoring.yml b/tests/user-workload-monitoring.yml index 053ae105..ce694018 100644 --- a/tests/user-workload-monitoring.yml +++ b/tests/user-workload-monitoring.yml @@ -1,3 +1,5 @@ +applications: + - cilium parameters: kapitan: dependencies: