diff --git a/docs/azure/prometheusAlerts.json b/docs/azure/prometheusAlerts.json index 442f6eb..b1bada3 100644 --- a/docs/azure/prometheusAlerts.json +++ b/docs/azure/prometheusAlerts.json @@ -518,12 +518,13 @@ }, { "alert": "KubeAggregatedAPIErrors", - "expr": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job=\"kube-apiserver\"}[10m])) > 4", + "expr": "sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job=\"kube-apiserver\"}[1m])) > 0", + "for": "10m", "labels": { "severity": "warning" }, "annotations": { - "description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", + "description": "Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors", "summary": "Kubernetes aggregated API has reported errors." } diff --git a/docs/azure/prometheusAlerts.yaml b/docs/azure/prometheusAlerts.yaml index bdb1329..614a262 100644 --- a/docs/azure/prometheusAlerts.yaml +++ b/docs/azure/prometheusAlerts.yaml @@ -390,11 +390,12 @@ groups: summary: Client certificate is about to expire. - alert: "KubeAggregatedAPIErrors" expr: |- - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[1m])) > 0 + for: 10m labels: severity: warning annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors summary: Kubernetes aggregated API has reported errors. - alert: "KubeAggregatedAPIDown" diff --git a/docs/default/prometheusAlerts.json b/docs/default/prometheusAlerts.json index c792a6a..7b683b9 100644 --- a/docs/default/prometheusAlerts.json +++ b/docs/default/prometheusAlerts.json @@ -518,12 +518,13 @@ }, { "alert": "KubeAggregatedAPIErrors", - "expr": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job=\"kube-apiserver\"}[10m])) > 4", + "expr": "sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job=\"kube-apiserver\"}[1m])) > 0", + "for": "10m", "labels": { "severity": "warning" }, "annotations": { - "description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", + "description": "Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors", "summary": "Kubernetes aggregated API has reported errors." } diff --git a/docs/default/prometheusAlerts.yaml b/docs/default/prometheusAlerts.yaml index 43be321..d9ac54f 100644 --- a/docs/default/prometheusAlerts.yaml +++ b/docs/default/prometheusAlerts.yaml @@ -390,11 +390,12 @@ groups: summary: Client certificate is about to expire. - alert: "KubeAggregatedAPIErrors" expr: |- - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[1m])) > 0 + for: 10m labels: severity: warning annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors summary: Kubernetes aggregated API has reported errors. - alert: "KubeAggregatedAPIDown" diff --git a/docs/multicluster/prometheusAlerts.json b/docs/multicluster/prometheusAlerts.json index 1d9e181..a2337f2 100644 --- a/docs/multicluster/prometheusAlerts.json +++ b/docs/multicluster/prometheusAlerts.json @@ -518,12 +518,13 @@ }, { "alert": "KubeAggregatedAPIErrors", - "expr": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job=\"kube-apiserver\"}[10m])) > 4", + "expr": "sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job=\"kube-apiserver\"}[1m])) > 0", + "for": "10m", "labels": { "severity": "warning" }, "annotations": { - "description": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", + "description": "Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors", "summary": "Kubernetes aggregated API has reported errors." } diff --git a/docs/multicluster/prometheusAlerts.yaml b/docs/multicluster/prometheusAlerts.yaml index d80fef5..2948b62 100644 --- a/docs/multicluster/prometheusAlerts.yaml +++ b/docs/multicluster/prometheusAlerts.yaml @@ -390,11 +390,12 @@ groups: summary: Client certificate is about to expire. - alert: "KubeAggregatedAPIErrors" expr: |- - sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4 + sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[1m])) > 0 + for: 10m labels: severity: warning annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors summary: Kubernetes aggregated API has reported errors. - alert: "KubeAggregatedAPIDown" diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index e2cce49..7050272 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -48,8 +48,8 @@ "subdir": "" } }, - "version": "a3fbf21977deb89b7d843eb8371170c011ea6835", - "sum": "57zW2IGJ9zbYd8BI0qe6JkoWTRSMNiBUWC6+YcnEsWo=" + "version": "d6ab1a7cd86a7fe687284f81b385ad0a82cbc70e", + "sum": "6Wz4q0u0Sp8r4ywtJOIR8EWArLHJKzO64rjqeku7bY8=" } ], "legacyImports": false