From e798a84e0da8f65a1c895d2abdd4fa5f26ed6ead Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:23:26 +0000 Subject: [PATCH 1/2] Update kubernetes-mixin digest to bdbf7f4 --- jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index 6f0f56a..edea84e 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -7,7 +7,7 @@ "subdir": "" } }, - "version": "c70f03daec5d7578ef697c9f5b93ea88a41fe0d7" + "version": "bdbf7f45cedf37d07567be7519fa4139043f9335" } ], "legacyImports": true, From ff5385f5b7cda172d05dc8860eb2ef9b409b28ef Mon Sep 17 00:00:00 2001 From: "bonddim-actions[bot]" <169367001+bonddim-actions[bot]@users.noreply.github.com> Date: Fri, 8 Nov 2024 00:24:28 +0000 Subject: [PATCH 2/2] Update files after Renovate update --- docs/azure/prometheusAlerts.json | 10 +++++----- docs/azure/prometheusAlerts.yaml | 10 +++++----- docs/default/prometheusAlerts.json | 10 +++++----- docs/default/prometheusAlerts.yaml | 10 +++++----- docs/multicluster/prometheusAlerts.json | 10 +++++----- docs/multicluster/prometheusAlerts.yaml | 10 +++++----- jsonnetfile.lock.json | 8 ++++---- 7 files changed, 34 insertions(+), 34 deletions(-) diff --git a/docs/azure/prometheusAlerts.json b/docs/azure/prometheusAlerts.json index 5898b41..e9aaa4d 100644 --- a/docs/azure/prometheusAlerts.json +++ b/docs/azure/prometheusAlerts.json @@ -122,13 +122,13 @@ }, { "alert": "KubeContainerWaiting", - "expr": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0", + "expr": "kube_pod_container_status_waiting_reason{reason!=\"CrashLoopBackOff\", job=\"kube-state-metrics\"} > 0", "for": "1h", "labels": { "severity": "warning" }, "annotations": { - "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.", + "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: \"{{ $labels.reason }}\").", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting", "summary": "Pod container waiting longer than 1 hour" } @@ -308,7 +308,7 @@ }, { "alert": "CPUThrottlingHigh", - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (cluster, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) > (25 / 100)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", job=\"cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / sum(increase(container_cpu_cfs_periods_total{job=\"cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > (25 / 100)", "for": "15m", "labels": { "severity": "info" @@ -492,7 +492,7 @@ "rules": [ { "alert": "KubeClientCertificateExpiration", - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 604800", + "expr": "histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0", "for": "5m", "labels": { "severity": "warning" @@ -505,7 +505,7 @@ }, { "alert": "KubeClientCertificateExpiration", - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 86400", + "expr": "histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0", "for": "5m", "labels": { "severity": "critical" diff --git a/docs/azure/prometheusAlerts.yaml b/docs/azure/prometheusAlerts.yaml index 880d1ef..d5c4d14 100644 --- a/docs/azure/prometheusAlerts.yaml +++ b/docs/azure/prometheusAlerts.yaml @@ -93,12 +93,12 @@ groups: summary: DaemonSet rollout is stuck. - alert: "KubeContainerWaiting" expr: |- - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour - alert: "KubeDaemonSetNotScheduled" @@ -234,7 +234,7 @@ groups: summary: Namespace quota has exceeded the limits. - alert: "CPUThrottlingHigh" expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) > (25 / 100) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / sum(increase(container_cpu_cfs_periods_total{job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > (25 / 100) for: 15m labels: severity: info @@ -370,7 +370,7 @@ groups: rules: - alert: "KubeClientCertificateExpiration" expr: |- - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: warning @@ -380,7 +380,7 @@ groups: summary: Client certificate is about to expire. - alert: "KubeClientCertificateExpiration" expr: |- - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: critical diff --git a/docs/default/prometheusAlerts.json b/docs/default/prometheusAlerts.json index ebec107..27c4396 100644 --- a/docs/default/prometheusAlerts.json +++ b/docs/default/prometheusAlerts.json @@ -122,13 +122,13 @@ }, { "alert": "KubeContainerWaiting", - "expr": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0", + "expr": "kube_pod_container_status_waiting_reason{reason!=\"CrashLoopBackOff\", job=\"kube-state-metrics\"} > 0", "for": "1h", "labels": { "severity": "warning" }, "annotations": { - "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.", + "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: \"{{ $labels.reason }}\").", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting", "summary": "Pod container waiting longer than 1 hour" } @@ -308,7 +308,7 @@ }, { "alert": "CPUThrottlingHigh", - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (cluster, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) > (25 / 100)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", job=\"cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / sum(increase(container_cpu_cfs_periods_total{job=\"cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > (25 / 100)", "for": "15m", "labels": { "severity": "info" @@ -492,7 +492,7 @@ "rules": [ { "alert": "KubeClientCertificateExpiration", - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 604800", + "expr": "histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0", "for": "5m", "labels": { "severity": "warning" @@ -505,7 +505,7 @@ }, { "alert": "KubeClientCertificateExpiration", - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 86400", + "expr": "histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0", "for": "5m", "labels": { "severity": "critical" diff --git a/docs/default/prometheusAlerts.yaml b/docs/default/prometheusAlerts.yaml index 1881411..5bc068f 100644 --- a/docs/default/prometheusAlerts.yaml +++ b/docs/default/prometheusAlerts.yaml @@ -93,12 +93,12 @@ groups: summary: DaemonSet rollout is stuck. - alert: "KubeContainerWaiting" expr: |- - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour - alert: "KubeDaemonSetNotScheduled" @@ -234,7 +234,7 @@ groups: summary: Namespace quota has exceeded the limits. - alert: "CPUThrottlingHigh" expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) > (25 / 100) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / sum(increase(container_cpu_cfs_periods_total{job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > (25 / 100) for: 15m labels: severity: info @@ -370,7 +370,7 @@ groups: rules: - alert: "KubeClientCertificateExpiration" expr: |- - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: warning @@ -380,7 +380,7 @@ groups: summary: Client certificate is about to expire. - alert: "KubeClientCertificateExpiration" expr: |- - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: critical diff --git a/docs/multicluster/prometheusAlerts.json b/docs/multicluster/prometheusAlerts.json index a280775..da56cee 100644 --- a/docs/multicluster/prometheusAlerts.json +++ b/docs/multicluster/prometheusAlerts.json @@ -122,13 +122,13 @@ }, { "alert": "KubeContainerWaiting", - "expr": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0", + "expr": "kube_pod_container_status_waiting_reason{reason!=\"CrashLoopBackOff\", job=\"kube-state-metrics\"} > 0", "for": "1h", "labels": { "severity": "warning" }, "annotations": { - "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.", + "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: \"{{ $labels.reason }}\").", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting", "summary": "Pod container waiting longer than 1 hour" } @@ -308,7 +308,7 @@ }, { "alert": "CPUThrottlingHigh", - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (cluster, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) > (25 / 100)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", job=\"cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / sum(increase(container_cpu_cfs_periods_total{job=\"cadvisor\", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > (25 / 100)", "for": "15m", "labels": { "severity": "info" @@ -492,7 +492,7 @@ "rules": [ { "alert": "KubeClientCertificateExpiration", - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 604800", + "expr": "histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0", "for": "5m", "labels": { "severity": "warning" @@ -505,7 +505,7 @@ }, { "alert": "KubeClientCertificateExpiration", - "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 86400", + "expr": "histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"kube-apiserver\"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job=\"kube-apiserver\"} > 0", "for": "5m", "labels": { "severity": "critical" diff --git a/docs/multicluster/prometheusAlerts.yaml b/docs/multicluster/prometheusAlerts.yaml index e5c0779..8c4fceb 100644 --- a/docs/multicluster/prometheusAlerts.yaml +++ b/docs/multicluster/prometheusAlerts.yaml @@ -93,12 +93,12 @@ groups: summary: DaemonSet rollout is stuck. - alert: "KubeContainerWaiting" expr: |- - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0 for: 1h labels: severity: warning annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour - alert: "KubeDaemonSetNotScheduled" @@ -234,7 +234,7 @@ groups: summary: Namespace quota has exceeded the limits. - alert: "CPUThrottlingHigh" expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) > (25 / 100) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / sum(increase(container_cpu_cfs_periods_total{job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > (25 / 100) for: 15m labels: severity: info @@ -370,7 +370,7 @@ groups: rules: - alert: "KubeClientCertificateExpiration" expr: |- - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: warning @@ -380,7 +380,7 @@ groups: summary: Client certificate is about to expire. - alert: "KubeClientCertificateExpiration" expr: |- - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: critical diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 6717ebf..4de8d3f 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "gen/grafonnet-latest" } }, - "version": "1ce5aec95ce32336fe47c8881361847c475b5254", + "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864", "sum": "64fMUPI3frXGj4X1FqFd1t7r04w3CUSmXaDcJ23EYbQ=" }, { @@ -18,7 +18,7 @@ "subdir": "gen/grafonnet-v11.1.0" } }, - "version": "1ce5aec95ce32336fe47c8881361847c475b5254", + "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864", "sum": "41w7p/rwrNsITqNHMXtGSJAfAyKmnflg6rFhKBduUxM=" }, { @@ -48,8 +48,8 @@ "subdir": "" } }, - "version": "0348e09edc3961a29a55f199d1bf0060c847a608", - "sum": "kTZuZcE+pNw8ZVZECKxrZG4F9BS+ydWMcgACE9oUrRc=" + "version": "bdbf7f45cedf37d07567be7519fa4139043f9335", + "sum": "j4EAKfqkbPvBFGnBjt4hex2bdNHPpuFWrCxfq5L6EkU=" } ], "legacyImports": false