-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added alertmanager and prometheus alerting rules
- Loading branch information
Showing
14 changed files
with
1,315 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@ | ||
groups: | ||
- name: alert.rules | ||
rules: | ||
|
||
# Since we're configuring prometheus on this host, I doubt this alert will ever trigger. | ||
- alert: InstanceDown | ||
expr: up == 0 | ||
for: 1m | ||
labels: | ||
severity: critical | ||
annotations: | ||
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down | ||
for more than 1 minute.' | ||
summary: Instance {{ $labels.instance }} down | ||
|
||
# CPU Usage Alerts | ||
- alert: NodeCPUUsageWarning | ||
expr: (100 - (avg(irate(node_cpu{mode="idle",name="node-exporter"}[5m])) BY (instance) | ||
* 100)) > 75 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: CPU usage is above 75% (current value is: | ||
{{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: High CPU usage detected' | ||
|
||
- alert: NodeCPUUsageCritical | ||
expr: (100 - (avg(irate(node_cpu{mode="idle",name="node-exporter"}[5m])) BY (instance) | ||
* 100)) > 90 | ||
for: 3m | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: CPU usage is above 90% (current value is: | ||
{{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: High CPU usage detected' | ||
|
||
|
||
# Load avg Alerts | ||
- alert: NodeLoadAverageWarning | ||
expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 4) | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: LA is high' | ||
SUMMARY: '{{$labels.instance}}: High LA detected' | ||
|
||
- alert: NodeLoadAverageCritical | ||
expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 7) | ||
for: 5m | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: LA is very high' | ||
SUMMARY: '{{$labels.instance}}: Very high LA detected' | ||
|
||
|
||
# Low Disk Space Alerts | ||
- alert: NodeLowRootDiskWarning | ||
expr: ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"}) | ||
/ node_filesystem_size{mountpoint="/root-disk"} * 100) > 80 | ||
for: 2m | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Root disk usage is above 80% (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Low root disk space' | ||
|
||
- alert: NodeLowRootDiskCritical | ||
expr: ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"}) | ||
/ node_filesystem_size{mountpoint="/root-disk"} * 100) > 95 | ||
for: 2m | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Root disk usage is above 95% (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Low root disk space' | ||
|
||
# Swap Usage Alerts | ||
- alert: NodeSwapUsageWarning | ||
expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) | ||
* 100) > 80 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Swap usage usage is above 80% (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Swap usage detected' | ||
|
||
- alert: NodeSwapUsageCritical | ||
expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) | ||
* 100) > 95 | ||
for: 3m | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Swap usage usage is above 95% (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Swap usage detected' | ||
|
||
# Memory Usage Alerts | ||
- alert: NodeMemoryUsageWarning | ||
expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) | ||
* 100)) > 80 | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Memory usage is above 80% (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: High memory usage detected' | ||
|
||
- alert: NodeMemoryUsageCritical | ||
expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) | ||
* 100)) > 95 | ||
for: 5m | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Memory usage is above 95% (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: High memory usage detected' | ||
|
||
# Latest Milestone Alerts | ||
- alert: LatestMileStoneWarning | ||
expr: iota_node_info_latest_milestone == 243000 | ||
for: 2h | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Latest Milestone Reset (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Latest Milestone reset and stuck' | ||
|
||
- alert: LatestMileStoneCritical | ||
expr: iota_node_info_latest_milestone == 243000 | ||
for: 4h | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Latest Milestone Reset (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Latest Milestone reset and stuck' | ||
|
||
- alert: LatestSubtangleMileStoneBehindWarning | ||
expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone) | ||
> 5 | ||
for: 1h | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Latest Subtangle Milestone lagging' | ||
|
||
- alert: LatestSubtangleMileStoneBehindCritical | ||
expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone) | ||
> 10 | ||
for: 1h | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Latest Subtangle Milestone lagging' | ||
|
||
# Neighbor Alerts | ||
- alert: TotalNeighborsFewWarning | ||
expr: iota_node_info_total_neighbors < 2 | ||
for: 1m | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Too few neighbors (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Too few neighbors' | ||
|
||
- alert: TotalNeighborsActiveWarning | ||
expr: iota_node_info_total_neighbors > 11 | ||
for: 1h | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Too many active neighbors (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Too many active neighbors' | ||
|
||
- alert: TotalNeighborsActiveCritical | ||
expr: iota_node_info_total_neighbors > 15 | ||
for: 1h | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Too many active neighbors (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Too many active neighbors' | ||
|
||
- alert: InactiveNeighborsWarning | ||
expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors) | ||
> 1 | ||
for: 1h | ||
labels: | ||
severity: warning | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Inactive Neighbors (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Inactive Neighbors' | ||
|
||
- alert: InactiveNeighborsCritical | ||
expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors) | ||
> 3 | ||
for: 1h | ||
labels: | ||
severity: critical | ||
annotations: | ||
DESCRIPTION: '{{$labels.instance}}: Inactive Neighbors (current value | ||
is: {{ $value }})' | ||
SUMMARY: '{{$labels.instance}}: Inactive Neighbors' | ||
|
Oops, something went wrong.