Skip to content

Commit

Permalink
fix(alerts): Increase duration before which ipmi alerts get triggered
Browse files Browse the repository at this point in the history
  • Loading branch information
dashmage authored Mar 25, 2024
1 parent 7f41663 commit dfabda2
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/prometheus_alert_rules/ipmi_dcmi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ groups:

- alert: IPMIDCMICommandFailed
expr: ipmi_dcmi_command_success == 0
for: 0m
for: 5m
labels:
severity: critical
annotations:
Expand Down
6 changes: 3 additions & 3 deletions src/prometheus_alert_rules/ipmi_sel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ groups:

- alert: IPMISELCommandFailed
expr: ipmi_sel_command_success == 0
for: 0m
for: 5m
labels:
severity: critical
annotations:
Expand All @@ -16,7 +16,7 @@ groups:
- alert: IPMISELStateWarning
expr: ipmi_sel_state == 1
for: 0m
for: 5m
labels:
severity: warning
annotations:
Expand All @@ -27,7 +27,7 @@ groups:
- alert: IPMISELStateCritical
expr: ipmi_sel_state == 2
for: 0m
for: 5m
labels:
severity: critical
annotations:
Expand Down
14 changes: 7 additions & 7 deletions src/prometheus_alert_rules/ipmi_sensors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ groups:

- alert: IPMIMonitoringCommandFailed
expr: ipmimonitoring_command_success == 0
for: 0m
for: 5m
labels:
severity: critical
annotations:
Expand All @@ -16,7 +16,7 @@ groups:
- alert: IPMITemperatureStateNotOk
expr: ipmi_temperature_celsius{state=~"Warning|Critical"}
for: 0m
for: 5m
labels:
severity: "{{ toLower $labels.state }}"
annotations:
Expand All @@ -28,7 +28,7 @@ groups:
- alert: IPMIPowerStateNotOk
expr: ipmi_power_watts{state=~"Warning|Critical"}
for: 0m
for: 5m
labels:
severity: "{{ toLower $labels.state }}"
annotations:
Expand All @@ -40,7 +40,7 @@ groups:
- alert: IPMIVoltageStateNotOk
expr: ipmi_voltage_volts{state=~"Warning|Critical"}
for: 0m
for: 5m
labels:
severity: "{{ toLower $labels.state }}"
annotations:
Expand All @@ -52,7 +52,7 @@ groups:
- alert: IPMICurrentStateNotOk
expr: ipmi_current_amperes{state=~"Warning|Critical"}
for: 0m
for: 5m
labels:
severity: "{{ toLower $labels.state }}"
annotations:
Expand All @@ -64,7 +64,7 @@ groups:
- alert: IPMIFanSpeedStateNotOk
expr: ipmi_fan_speed_rpm{state=~"Warning|Critical"}
for: 0m
for: 5m
labels:
severity: "{{ toLower $labels.state }}"
annotations:
Expand All @@ -78,7 +78,7 @@ groups:
# Slot/Connector sensors are ignored since they raise a high number of false-positive alerts
- alert: IPMISensorStateNotOk
expr: ipmi_generic_sensor_value{state=~"Warning|Critical", type!~"Entity\\sPresence|Slot/Connector"}
for: 0m
for: 5m
labels:
severity: "{{ toLower $labels.state }}"
annotations:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_alert_rules/test_ipmi_dcmi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ tests:
values: '0.75x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMIDCMICommandFailed
exp_alerts:
- exp_labels:
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_alert_rules/test_ipmi_sel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ tests:
values: '2x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMISELCommandFailed
exp_alerts:
- exp_labels:
Expand All @@ -33,7 +33,7 @@ tests:
VALUE = 0
LABELS = map[__name__:ipmi_sel_command_success instance:ubuntu-0]
- eval_time: 0m
- eval_time: 10m
alertname: IPMISELStateWarning
exp_alerts:
- exp_labels:
Expand All @@ -45,7 +45,7 @@ tests:
IPMI SEL entry in warning state.
LABELS = map[__name__:ipmi_sel_state instance:ubuntu-1]
- eval_time: 0m
- eval_time: 10m
alertname: IPMISELStateCritical
exp_alerts:
- exp_labels:
Expand Down
14 changes: 7 additions & 7 deletions tests/unit/test_alert_rules/test_ipmi_sensors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ tests:
values: '0x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMIMonitoringCommandFailed
exp_alerts:
- exp_labels:
Expand All @@ -35,7 +35,7 @@ tests:
values: '120x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMITemperatureStateNotOk
exp_alerts:
- exp_labels:
Expand Down Expand Up @@ -71,7 +71,7 @@ tests:
values: '240x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMIPowerStateNotOk
exp_alerts:
- exp_labels:
Expand Down Expand Up @@ -107,7 +107,7 @@ tests:
values: '280x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMIVoltageStateNotOk
exp_alerts:
- exp_labels:
Expand Down Expand Up @@ -144,7 +144,7 @@ tests:
values: '200x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMICurrentStateNotOk
exp_alerts:
- exp_labels:
Expand Down Expand Up @@ -180,7 +180,7 @@ tests:
values: '4000x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMIFanSpeedStateNotOk
exp_alerts:
- exp_labels:
Expand Down Expand Up @@ -220,7 +220,7 @@ tests:
values: '50x15'

alert_rule_test:
- eval_time: 0m
- eval_time: 10m
alertname: IPMISensorStateNotOk
exp_alerts:
- exp_labels:
Expand Down

0 comments on commit dfabda2

Please sign in to comment.