Skip to content

Commit

Permalink
Removed unavailable alerts; upped cpu rate to 90
Browse files Browse the repository at this point in the history
  • Loading branch information
GilHoggarth committed Dec 9, 2023
1 parent 75879b0 commit b08fb45
Showing 1 changed file with 4 additions and 21 deletions.
25 changes: 4 additions & 21 deletions monitor/aws/prometheus/alert.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@ groups:
summary: "The frequent crawls are not running as fast as expected!"
description: "The frequent crawls do not appear to be running as fast as they should be."

# Severe alert: the 1h delta of ukwa_crawler_log_size_bytes for crawl.log is
# zero, or that series is absent, and the condition has held for one hour.
- alert: tidy-logs_no_new_crawl_logs
  expr: >-
    delta(ukwa_crawler_log_size_bytes{log='crawl.log'}[1h]) == 0
    or absent(ukwa_crawler_log_size_bytes{log='crawl.log'})
  for: 1h
  labels:
    severity: severe
  annotations:
    summary: 'No new crawl logs from tidy-logs'
    description: '{{ $labels.instance }} of job {{ $labels.job }} failed to run.'

- name: Generic metrics
rules:
Expand All @@ -53,13 +45,13 @@ groups:
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

# Severe alert: per-host CPU utilisation (100 minus the idle percentage,
# averaged over all idle-mode CPU series of the same instance) above 90% for
# 15 minutes.
# Grouping by instance keeps the `instance` label on the result so the
# annotations can render it; a bare avg() collapses every host into a single
# label-less series, leaving {{ $labels.instance }} empty.
# NOTE(review): the Prometheus docs recommend rate() rather than irate() in
# alert expressions, because a brief change in irate() can reset the `for`
# timer - consider switching after confirming the desired sensitivity.
- alert: host_cpu_rate
  expr: 100 - 100 * avg by (instance) (irate(node_cpu_seconds_total{mode='idle'}[10m])) > 90
  for: 15m
  labels:
    severity: severe
  annotations:
    summary: "CPU rate on {{ $labels.instance }} > 90%"
    description: "CPU rate on {{ $labels.instance }} > 90% for more than 15 minutes"

- alert: host_free_disk_space
expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"}/node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100 < 15
Expand All @@ -69,12 +61,3 @@ groups:
annotations:
summary: "Disk space free < 15% on {{ $labels.instance }}"
description: "Disk space free on {{ $labels.instance }} has been less than 15% for more than 5 minutes"

# Severe alert: the maximum node_hwmon_temp_celsius value per instance has
# stayed above 70 for 30 minutes.
- alert: cpu_running_too_hot
  expr: 'max(node_hwmon_temp_celsius) by (instance) > 70'
  for: 30m
  labels:
    severity: severe
  annotations:
    summary: 'CPU running too hot?'
    description: 'The CPU on {{ $labels.instance }} is running hot (>70C for 30mins).'

0 comments on commit b08fb45

Please sign in to comment.