+          {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
+            {{ .Name }}={{ .Value }}
+          {{ end }}
[tail of a template hunk; the surrounding HTML table markup was garbled in extraction and is omitted]
diff --git a/group_vars/all/monitoring.yml b/group_vars/all/monitoring.yml
index f35969a..7391408 100644
--- a/group_vars/all/monitoring.yml
+++ b/group_vars/all/monitoring.yml
@@ -17,3 +17,14 @@ nodesource_version: 8
 iota_prom_exporter_basedir: iota-prom-exporter
 iota_prom_exporter_port: 9311
 iota_prom_exporter_bind: 127.0.0.1
+
+alertmanager_basedir: /opt/prometheus/alertmanager
+alertmanager_version: 0.12.0
+alertmanager_port: 9093
+alertmanager_nginx_port: 9993
+alertmanager_bind: 127.0.0.1
+alertmanager_email_from: alertmanager
+alertmanager_email_to: root@localhost
+alertmanager_loglevel: info
+smtp_host: localhost
+smtp_port: 25
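For context: these variables are presumably consumed by an Alertmanager configuration template elsewhere in the role, which this diff does not show. A minimal sketch of what such a Jinja2 template could look like; the file path (roles/monitoring/templates/alertmanager.yml.j2) and the receiver name are assumptions, not facts from this change:

# Sketch only -- structure and file name are assumptions, not part of this diff.
global:
  smtp_smarthost: '{{ smtp_host }}:{{ smtp_port }}'
  smtp_from: '{{ alertmanager_email_from }}'

templates:
  - '{{ alertmanager_basedir }}/email.tmpl'

route:
  receiver: email-me
  group_by: ['alertname', 'instance']

receivers:
  - name: email-me
    email_configs:
      - to: '{{ alertmanager_email_to }}'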
diff --git a/roles/monitoring/files/alert.rules.yml b/roles/monitoring/files/alert.rules.yml
new file mode 100644
index 0000000..3ddcf44
--- /dev/null
+++ b/roles/monitoring/files/alert.rules.yml
@@ -0,0 +1,224 @@
+groups:
+- name: alert.rules
+  rules:
+
+  # Since we're configuring prometheus on this host, I doubt this alert will ever trigger.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
+        for more than 1 minute.'
+      summary: Instance {{ $labels.instance }} down
+
+  # CPU Usage Alerts
+  - alert: NodeCPUUsageWarning
+    expr: (100 - (avg(irate(node_cpu{mode="idle",name="node-exporter"}[5m])) BY (instance)
+      * 100)) > 75
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: CPU usage is above 75% (current value is:
+        {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: High CPU usage detected'
+
+  - alert: NodeCPUUsageCritical
+    expr: (100 - (avg(irate(node_cpu{mode="idle",name="node-exporter"}[5m])) BY (instance)
+      * 100)) > 90
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: CPU usage is above 90% (current value is:
+        {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: High CPU usage detected'
+
+  # Load Average Alerts
+  - alert: NodeLoadAverageWarning
+    expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 4)
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Load average is high'
+      SUMMARY: '{{$labels.instance}}: High load average detected'
+
+  - alert: NodeLoadAverageCritical
+    expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 7)
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Load average is very high'
+      SUMMARY: '{{$labels.instance}}: Very high load average detected'
+
+  # Low Disk Space Alerts
+  - alert: NodeLowRootDiskWarning
+    expr: ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"})
+      / node_filesystem_size{mountpoint="/root-disk"} * 100) > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Root disk usage is above 80% (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Low root disk space'
+
+  - alert: NodeLowRootDiskCritical
+    expr: ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"})
+      / node_filesystem_size{mountpoint="/root-disk"} * 100) > 95
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Root disk usage is above 95% (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Low root disk space'
+
+  # Swap Usage Alerts
+  - alert: NodeSwapUsageWarning
+    expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
+      * 100) > 80
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Swap usage is above 80% (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Swap usage detected'
+
+  - alert: NodeSwapUsageCritical
+    expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
+      * 100) > 95
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Swap usage is above 95% (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Swap usage detected'
+
+  # Memory Usage Alerts
+  - alert: NodeMemoryUsageWarning
+    expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
+      * 100)) > 80
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Memory usage is above 80% (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: High memory usage detected'
+
+  - alert: NodeMemoryUsageCritical
+    expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
+      * 100)) > 95
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Memory usage is above 95% (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: High memory usage detected'
+
+  # Latest Milestone Alerts
+  - alert: LatestMileStoneWarning
+    expr: iota_node_info_latest_milestone == 243000
+    for: 2h
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Latest Milestone Reset (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Latest Milestone reset and stuck'
+
+  - alert: LatestMileStoneCritical
+    expr: iota_node_info_latest_milestone == 243000
+    for: 4h
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Latest Milestone Reset (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Latest Milestone reset and stuck'
+
+  - alert: LatestSubtangleMileStoneBehindWarning
+    expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone)
+      > 5
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Latest Subtangle Milestone lagging'
+
+  - alert: LatestSubtangleMileStoneBehindCritical
+    expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone)
+      > 10
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Latest Subtangle Milestone lagging'
+
+  # Neighbor Alerts
+  - alert: TotalNeighborsFewWarning
+    expr: iota_node_info_total_neighbors < 2
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Too few neighbors (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Too few neighbors'
+
+  - alert: TotalNeighborsActiveWarning
+    expr: iota_node_info_total_neighbors > 11
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Too many active neighbors (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Too many active neighbors'
+
+  - alert: TotalNeighborsActiveCritical
+    expr: iota_node_info_total_neighbors > 15
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Too many active neighbors (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Too many active neighbors'
+
+  - alert: InactiveNeighborsWarning
+    expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors)
+      > 1
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Inactive Neighbors (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Inactive Neighbors'
+
+  - alert: InactiveNeighborsCritical
+    expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors)
+      > 3
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      DESCRIPTION: '{{$labels.instance}}: Inactive Neighbors (current value
+        is: {{ $value }})'
+      SUMMARY: '{{$labels.instance}}: Inactive Neighbors'
+
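These rules only take effect once Prometheus loads the file and knows where Alertmanager listens; that wiring is implied but not shown in this diff. Roughly, prometheus.yml would need entries like the following, where the rule-file path and target address are assumptions derived from the variables above:

# Sketch only -- the actual prometheus.yml template is not part of this diff.
rule_files:
  - /opt/prometheus/alert.rules.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['127.0.0.1:9093']  # alertmanager_bind:alertmanager_port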
diff --git a/roles/monitoring/files/email.tmpl b/roles/monitoring/files/email.tmpl
new file mode 100644
index 0000000..4f1d648
--- /dev/null
+++ b/roles/monitoring/files/email.tmpl
@@ -0,0 +1,405 @@
[the 405-line HTML email template body was garbled in extraction and is omitted]
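Deployment of the two new files is presumably handled by tasks in the monitoring role; nothing in this diff shows them. Illustrative Ansible tasks along those lines, where the task names, destination paths, and handlers are assumptions:

# Sketch only -- illustrative tasks, not taken from this change.
- name: Copy Prometheus alert rules
  copy:
    src: alert.rules.yml
    dest: /opt/prometheus/alert.rules.yml
  notify: restart prometheus

- name: Copy Alertmanager email template
  copy:
    src: email.tmpl
    dest: '{{ alertmanager_basedir }}/email.tmpl'
  notify: restart alertmanager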