Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conserve labels in query result, add selector support #8

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion alerts/absent.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
{
local alert = 'CertManagerAbsent',
alert: alert,
expr: 'absent(up{job="%(certManagerJobLabel)s"})' % $._config,
expr: 'absent(up{%s})' % $._config.certManagerSelector,
'for': '10m',
labels: {
severity: 'critical',
Expand Down
19 changes: 8 additions & 11 deletions alerts/certificates.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
local alert = 'CertManagerCertExpirySoon',
alert: alert,
expr: |||
avg by (exported_namespace, namespace, name) (
certmanager_certificate_expiration_timestamp_seconds - time()
) < (%s * 24 * 3600) # 21 days in seconds
||| % $._config.certManagerCertExpiryDays,
certmanager_certificate_expiration_timestamp_seconds{%s} - time()
< (%s * 24 * 3600) # 21 days in seconds
||| % [$._config.certManagerSelector, $._config.certManagerCertExpiryDays],
'for': '1h',
labels: {
severity: 'warning',
Expand All @@ -26,10 +25,8 @@
local alert = 'CertManagerCertNotReady',
alert: alert,
expr: |||
max by (name, exported_namespace, namespace, condition) (
certmanager_certificate_ready_status{condition!="True"} == 1
)
|||,
certmanager_certificate_ready_status{%s, condition!="True"} == 1
||| % $._config.certManagerSelector,
'for': '10m',
labels: {
severity: 'critical',
Expand All @@ -45,10 +42,10 @@
local alert = 'CertManagerHittingRateLimits',
alert: alert,
expr: |||
sum by (host) (
rate(certmanager_http_acme_client_request_count{status="429"}[5m])
sum without (method, path) (
rate(certmanager_http_acme_client_request_count{%s, status="429"}[5m])
) > 0
|||,
||| % $._config.certManagerSelector,
'for': '5m',
labels: {
severity: 'critical',
Expand Down
2 changes: 1 addition & 1 deletion config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
{
_config+:: {
certManagerCertExpiryDays: '21',
certManagerJobLabel: 'cert-manager',
certManagerSelector: 'job="cert-manager"',
certManagerRunbookURLPattern: 'https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#%s',
grafanaExternalUrl: 'https://grafana.example.com',

Expand Down
22 changes: 14 additions & 8 deletions tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,21 @@ tests:
# Cert expiry
- interval: 1m
input_series:
- series: certmanager_certificate_expiration_timestamp_seconds{namespace="cert-manager", exported_namespace="test", name="expired-ingress-cert", foo="bar"}
- series: certmanager_certificate_expiration_timestamp_seconds{job="cert-manager", namespace="cert-manager", exported_namespace="test", name="expired-ingress-cert", foo="bar"}
values: 1814400+0x43200 # 21d in seconds, static for 30d of samples
- series: certmanager_certificate_expiration_timestamp_seconds{namespace="cert-manager", exported_namespace="test", name="90d-ingress-cert"}
- series: certmanager_certificate_expiration_timestamp_seconds{job="cert-manager", namespace="cert-manager", exported_namespace="test", name="90d-ingress-cert"}
values: 7776000+0x43200 # 90d in seconds, static for 30d of samples
alert_rule_test:
- eval_time: 61m
alertname: CertManagerCertExpirySoon
exp_alerts:
- exp_labels:
severity: warning
job: cert-manager
exported_namespace: test
namespace: cert-manager
name: expired-ingress-cert
foo: bar
exp_annotations:
summary: The cert `expired-ingress-cert` is 20d 22h 59m 0s from expiry, it should have renewed over a week ago.
description: "The domain that this cert covers will be unavailable after 20d 22h 59m 0s. Clients using endpoints that this cert protects will start to fail in 20d 22h 59m 0s."
Expand All @@ -46,18 +48,19 @@ tests:
# Cert not ready
- interval: 1m
input_series:
- series: certmanager_certificate_ready_status{namespace="cert-manager", exported_namespace="test", name="ready", condition="True"}
- series: certmanager_certificate_ready_status{job="cert-manager", namespace="cert-manager", exported_namespace="test", name="ready", condition="True"}
values: 1+0x30
- series: certmanager_certificate_ready_status{namespace="cert-manager", exported_namespace="test", name="not ready", condition="False"}
- series: certmanager_certificate_ready_status{job="cert-manager", namespace="cert-manager", exported_namespace="test", name="not ready", condition="False"}
values: 1+0x30
- series: certmanager_certificate_ready_status{namespace="cert-manager", exported_namespace="test", name="who knows", condition="Unknown"}
- series: certmanager_certificate_ready_status{job="cert-manager", namespace="cert-manager", exported_namespace="test", name="who knows", condition="Unknown"}
values: 1+0x30
alert_rule_test:
- eval_time: 10m
alertname: CertManagerCertNotReady
exp_alerts:
- exp_labels:
severity: critical
job: cert-manager
exported_namespace: test
namespace: cert-manager
name: not ready
Expand All @@ -71,6 +74,7 @@ tests:
runbook_url: "https://github.com/imusmanmalik/cert-manager-mixin/blob/main/RUNBOOK.md#certmanagercertnotready"
- exp_labels:
severity: critical
job: cert-manager
exported_namespace: test
namespace: cert-manager
name: who knows
Expand All @@ -86,19 +90,21 @@ tests:
# cert-manager rate limits
- interval: 1m
input_series:
- series: certmanager_http_acme_client_request_count{status="200", host="normal.acme-v02.api.letsencrypt.org", path="/acme/new-order"}
- series: certmanager_http_acme_client_request_count{job="cert-manager", status="200", host="normal.acme-v02.api.letsencrypt.org", path="/acme/new-order"}
values: 1+1x30
- series: certmanager_http_acme_client_request_count{status="429", host="rate-limited.acme-v02.api.letsencrypt.org", path="/acme/new-order"}
- series: certmanager_http_acme_client_request_count{job="cert-manager", status="429", host="rate-limited.acme-v02.api.letsencrypt.org", path="/acme/new-order"}
values: 1+1x30
- series: certmanager_http_acme_client_request_count{status="429", host="one-limited-request.acme-v02.api.letsencrypt.org", path="/acme/new-order"}
- series: certmanager_http_acme_client_request_count{job="cert-manager", status="429", host="one-limited-request.acme-v02.api.letsencrypt.org", path="/acme/new-order"}
values: 1+0x30
alert_rule_test:
- eval_time: 10m
alertname: CertManagerHittingRateLimits
exp_alerts:
- exp_labels:
severity: critical
job: cert-manager
host: rate-limited.acme-v02.api.letsencrypt.org
status: 429
exp_annotations:
summary: "Cert manager hitting LetsEncrypt rate limits."
description: "Depending on the rate limit, cert-manager may be unable to generate certificates for up to a week."
Expand Down