Skip to content

Commit

Permalink
Add nvme lifetime alert (canonical#268)
Browse files Browse the repository at this point in the history
  • Loading branch information
zxhdaze authored Jun 26, 2024
1 parent e4b0c79 commit f9276db
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 0 deletions.
26 changes: 26 additions & 0 deletions src/prometheus_alert_rules/smart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,29 @@ groups:
on device "{{ $labels.device }}" is greater than 0.
VALUE = {{ $value }}
LABELS = {{ $labels }}
# Warning tier of the NVMe wear alerts: fires once the "percentage used"
# metric has been at or above 80 for 15 consecutive minutes.
# The expression has no upper bound, so this rule keeps firing after the
# critical tier (SmartNVMeDriveLifetimeCritical, >= 90) is reached as well.
- alert: SmartNVMeDriveLifetimeWarning
  # Prometheus regex matchers are fully anchored: this selects series whose
  # device label starts with "nvme".
  expr: smartctl_device_percentage_used{device=~"nvme.*"} >= 80
  # The threshold must hold continuously for 15m before the alert fires.
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: NVMe drive is approaching its estimated lifetime (instance {{ $labels.instance }})
    # Literal block scalar: line breaks and the single trailing newline are
    # part of the rendered annotation.
    description: |
      The NVMe drive has reached 80% of its estimated lifetime.
      Note: A value of 100 does not indicate failure. For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
# Critical tier of the NVMe wear alerts: fires once the "percentage used"
# metric has been at or above 90 for 15 consecutive minutes.
- alert: SmartNVMeDriveLifetimeCritical
  # Prometheus regex matchers are fully anchored: this selects series whose
  # device label starts with "nvme".
  expr: smartctl_device_percentage_used{device=~"nvme.*"} >= 90
  # The threshold must hold continuously for 15m before the alert fires.
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: NVMe drive is close to reaching its estimated lifetime (instance {{ $labels.instance }})
    # Literal block scalar: line breaks and the single trailing newline are
    # part of the rendered annotation.
    description: |
      The NVMe drive has reached 90% of its estimated lifetime.
      Note: A value of 100 does not indicate failure. For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
43 changes: 43 additions & 0 deletions tests/unit/test_alert_rules/test_smart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,46 @@ tests:
on device "sda" is greater than 0.
VALUE = 2
LABELS = map[__name__:smartctl_device_attribute attribute_id:5 attribute_name:Reallocated_Sectors_Count device:sda instance:ubuntu-2]
# SmartNVMeDriveLifetimeWarning: a constant value of 85 (>= 80, < 90) sampled
# once per minute from 0m through 20m.
- interval: 1m
  input_series:
    - series: 'smartctl_device_percentage_used{device="nvme", instance="ubuntu-3"}'
      values: '85x20'

  alert_rule_test:
    # 10m is inside the rule's `for: 15m` window: the alert is only pending,
    # so no firing alert is expected yet.
    - eval_time: 10m
      alertname: SmartNVMeDriveLifetimeWarning
      exp_alerts: []
    # After the threshold has held for more than 15m the alert must be firing.
    - eval_time: 20m
      alertname: SmartNVMeDriveLifetimeWarning
      exp_alerts:
        - exp_labels:
            severity: warning
            instance: ubuntu-3
            device: nvme
          exp_annotations:
            summary: NVMe drive is approaching its estimated lifetime (instance ubuntu-3)
            description: |
              The NVMe drive has reached 80% of its estimated lifetime.
              Note: A value of 100 does not indicate failure. For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart
              VALUE = 85
              LABELS = map[__name__:smartctl_device_percentage_used device:nvme instance:ubuntu-3]
    # 85 is below the critical threshold (>= 90), so the critical rule must
    # stay silent for this series.
    - eval_time: 20m
      alertname: SmartNVMeDriveLifetimeCritical
      exp_alerts: []
# SmartNVMeDriveLifetimeCritical: a constant value of 95 (>= 90) sampled once
# per minute from 0m through 20m.
- interval: 1m
  input_series:
    - series: 'smartctl_device_percentage_used{device="nvme", instance="ubuntu-4"}'
      values: '95x20'

  alert_rule_test:
    # 10m is inside the rule's `for: 15m` window: the alert is only pending,
    # so no firing alert is expected yet.
    - eval_time: 10m
      alertname: SmartNVMeDriveLifetimeCritical
      exp_alerts: []
    # After the threshold has held for more than 15m the alert must be firing.
    - eval_time: 20m
      alertname: SmartNVMeDriveLifetimeCritical
      exp_alerts:
        - exp_labels:
            severity: critical
            instance: ubuntu-4
            device: nvme
          exp_annotations:
            summary: NVMe drive is close to reaching its estimated lifetime (instance ubuntu-4)
            description: |
              The NVMe drive has reached 90% of its estimated lifetime.
              Note: A value of 100 does not indicate failure. For more details, visit https://charmhub.io/hardware-observer/docs/metrics-and-alerts-smart
              VALUE = 95
              LABELS = map[__name__:smartctl_device_percentage_used device:nvme instance:ubuntu-4]

0 comments on commit f9276db

Please sign in to comment.