From 91128dccc2adb62187666451517a37880cfdbc92 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Thu, 23 Nov 2023 19:24:46 +0800 Subject: [PATCH 1/4] feat(alert): Add collector fail alert --- src/prometheus_alert_rules/general.yaml | 13 +++++++ tests/unit/test_alert_rules/test_general.yaml | 35 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 src/prometheus_alert_rules/general.yaml create mode 100644 tests/unit/test_alert_rules/test_general.yaml diff --git a/src/prometheus_alert_rules/general.yaml b/src/prometheus_alert_rules/general.yaml new file mode 100644 index 00000000..78df8b7c --- /dev/null +++ b/src/prometheus_alert_rules/general.yaml @@ -0,0 +1,13 @@ +groups: +- name: HardwareObserver + rules: + - alert: CollectorFailed + expr: '{__name__=~"(.*)_collector_failed"} == 1' + for: 0m + labels: + severity: warning + annotations: + summary: Controller fetch failed. (instance {{ $labels.instance }}) + description: | + Collrector fetch failed. Please reach out to hardware-observer maintainers. + LABELS = {{ $labels }} diff --git a/tests/unit/test_alert_rules/test_general.yaml b/tests/unit/test_alert_rules/test_general.yaml new file mode 100644 index 00000000..61c10a3e --- /dev/null +++ b/tests/unit/test_alert_rules/test_general.yaml @@ -0,0 +1,35 @@ +rule_files: + - ../../../src/prometheus_alert_rules/general.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: ipmidcmicollector_collector_failed{instance="ubuntu-99", collector="IPMIDCMICollector"} + values: '1x15' + - series: ipmiselcollector_collector_failed{instance="ubuntu-99", collector="IPMISELCollector"} + values: '1x15' + + alert_rule_test: + - eval_time: 0m + alertname: CollectorFailed + exp_alerts: + - exp_labels: + severity: warning + instance: ubuntu-99 + collector: IPMIDCMICollector + exp_annotations: + summary: Controller fetch failed. (instance ubuntu-99) + description: | + Collrector fetch failed. Please reach out to hardware-observer maintainers. + LABELS = map[__name__:ipmidcmicollector_collector_failed collector:IPMIDCMICollector instance:ubuntu-99] + - exp_labels: + severity: warning + instance: ubuntu-99 + collector: IPMISELCollector + exp_annotations: + summary: Controller fetch failed. (instance ubuntu-99) + description: | + Collrector fetch failed. Please reach out to hardware-observer maintainers. + LABELS = map[__name__:ipmiselcollector_collector_failed collector:IPMISELCollector instance:ubuntu-99] From 4626d4c058c3c3fe3a5718e6c6c8be1521ec1285 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Fri, 24 Nov 2023 15:44:54 +0800 Subject: [PATCH 2/4] fix: typo and wording fix --- src/prometheus_alert_rules/general.yaml | 4 ++-- tests/unit/test_alert_rules/test_general.yaml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/prometheus_alert_rules/general.yaml b/src/prometheus_alert_rules/general.yaml index 78df8b7c..064764f1 100644 --- a/src/prometheus_alert_rules/general.yaml +++ b/src/prometheus_alert_rules/general.yaml @@ -7,7 +7,7 @@ groups: labels: severity: warning annotations: - summary: Controller fetch failed. (instance {{ $labels.instance }}) + summary: Collector failed. (instance {{ $labels.instance }}) description: | - Collrector fetch failed. Please reach out to hardware-observer maintainers. + A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. LABELS = {{ $labels }} diff --git a/tests/unit/test_alert_rules/test_general.yaml b/tests/unit/test_alert_rules/test_general.yaml index 61c10a3e..6ee264c5 100644 --- a/tests/unit/test_alert_rules/test_general.yaml +++ b/tests/unit/test_alert_rules/test_general.yaml @@ -20,16 +20,16 @@ tests: instance: ubuntu-99 collector: IPMIDCMICollector exp_annotations: - summary: Controller fetch failed. (instance ubuntu-99) + summary: Collector failed. (instance ubuntu-99) description: | - Collrector fetch failed. Please reach out to hardware-observer maintainers. + A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. LABELS = map[__name__:ipmidcmicollector_collector_failed collector:IPMIDCMICollector instance:ubuntu-99] - exp_labels: severity: warning instance: ubuntu-99 collector: IPMISELCollector exp_annotations: - summary: Controller fetch failed. (instance ubuntu-99) + summary: Collector failed. (instance ubuntu-99) description: | - Collrector fetch failed. Please reach out to hardware-observer maintainers. + A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. LABELS = map[__name__:ipmiselcollector_collector_failed collector:IPMISELCollector instance:ubuntu-99] From 0367d86fa54adbc83e4f7731697367e434829aee Mon Sep 17 00:00:00 2001 From: jneo8 Date: Mon, 27 Nov 2023 17:10:22 +0800 Subject: [PATCH 3/4] fix: Update severity level to critical --- src/prometheus_alert_rules/general.yaml | 2 +- tests/unit/test_alert_rules/test_general.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prometheus_alert_rules/general.yaml b/src/prometheus_alert_rules/general.yaml index 064764f1..fb62d695 100644 --- a/src/prometheus_alert_rules/general.yaml +++ b/src/prometheus_alert_rules/general.yaml @@ -5,7 +5,7 @@ groups: expr: '{__name__=~"(.*)_collector_failed"} == 1' for: 0m labels: - severity: warning + severity: critical annotations: summary: Collector failed. (instance {{ $labels.instance }}) description: | diff --git a/tests/unit/test_alert_rules/test_general.yaml b/tests/unit/test_alert_rules/test_general.yaml index 6ee264c5..69970a0f 100644 --- a/tests/unit/test_alert_rules/test_general.yaml +++ b/tests/unit/test_alert_rules/test_general.yaml @@ -16,7 +16,7 @@ tests: alertname: CollectorFailed exp_alerts: - exp_labels: - severity: warning + severity: critical instance: ubuntu-99 collector: IPMIDCMICollector exp_annotations: @@ -25,7 +25,7 @@ tests: A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. LABELS = map[__name__:ipmidcmicollector_collector_failed collector:IPMIDCMICollector instance:ubuntu-99] - exp_labels: - severity: warning + severity: critical instance: ubuntu-99 collector: IPMISELCollector exp_annotations: From 365ae55ca634c39ca2aa5523b2abdcc28ffd38b6 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Wed, 29 Nov 2023 18:30:23 +0800 Subject: [PATCH 4/4] fix: change alert severity to error --- src/prometheus_alert_rules/general.yaml | 2 +- tests/unit/test_alert_rules/test_general.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prometheus_alert_rules/general.yaml b/src/prometheus_alert_rules/general.yaml index fb62d695..2b0ac09a 100644 --- a/src/prometheus_alert_rules/general.yaml +++ b/src/prometheus_alert_rules/general.yaml @@ -5,7 +5,7 @@ groups: expr: '{__name__=~"(.*)_collector_failed"} == 1' for: 0m labels: - severity: critical + severity: error annotations: summary: Collector failed. (instance {{ $labels.instance }}) description: | diff --git a/tests/unit/test_alert_rules/test_general.yaml b/tests/unit/test_alert_rules/test_general.yaml index 69970a0f..6c0cb053 100644 --- a/tests/unit/test_alert_rules/test_general.yaml +++ b/tests/unit/test_alert_rules/test_general.yaml @@ -16,7 +16,7 @@ tests: alertname: CollectorFailed exp_alerts: - exp_labels: - severity: critical + severity: error instance: ubuntu-99 collector: IPMIDCMICollector exp_annotations: @@ -25,7 +25,7 @@ tests: A collector failed to fetch the metrics. Please reach out to hardware-observer maintainers. LABELS = map[__name__:ipmidcmicollector_collector_failed collector:IPMIDCMICollector instance:ubuntu-99] - exp_labels: - severity: critical + severity: error instance: ubuntu-99 collector: IPMISELCollector exp_annotations: