Skip to content

Commit

Permalink
Finally resolved alerting
Browse files Browse the repository at this point in the history
  • Loading branch information
GilHoggarth committed Dec 8, 2023
1 parent 1c8673d commit 75879b0
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 37 deletions.
21 changes: 7 additions & 14 deletions monitor/aws/alertmanager/config.yml-template
Original file line number Diff line number Diff line change
@@ -1,23 +1,16 @@
global:
resolve_timeout: 5m
slack_api_url: ${SLACK_API_URL}
slack_api_url: "${SLACK_API_URL}"

# The directory from which notification templates are read.
templates:
- '/etc/alertmanager/template/*.tmpl'

# Route all alerts to the Slack receiver.
route:
- group_by: ['alertname', 'job']
- group_wait: 30s
- group_interval: 5m
- repeat_interval: 12h
receiver: 'slack-webhook'

receiver: slack
group_by: ['alertname', 'job']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h

receivers:
- name: 'slack-webhook'
- name: slack
slack_configs:
- channel: '#alerts'
send_resolved: true

8 changes: 4 additions & 4 deletions monitor/aws/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version: '3.2'

services:
alertmanager:
image: prom/alertmanager
image: prom/alertmanager:main
ports:
- "${ALERTMANAGER_PORT}:9093"
volumes:
Expand All @@ -15,11 +15,11 @@ services:
- '--web.external-url=http://${ALERTMANAGER_SERVICE_NAME}/'

prometheus:
image: prom/prometheus:v2.30.3
image: prom/prometheus:main
ports:
- "${PROMETHEUS_PORT}:9090"
depends_on:
- alertmanager:alertmanager
- alertmanager
user: root
volumes:
- ./prometheus:/etc/prometheus
Expand All @@ -38,7 +38,7 @@ services:
- '--web.enable-lifecycle'

grafana:
image: grafana/grafana:7.5.7
image: grafana/grafana:main
ports:
- "${GRAFANA_PORT}:3000"
depends_on:
Expand Down
22 changes: 12 additions & 10 deletions monitor/aws/prometheus/alert.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,22 @@ groups:
summary: "The frequent crawls are not running as fast as expected!"
description: "The frequent crawls do not appear to be running as fast as it should be."

- alert: tidy-logs_no_new_crawl_logs
expr: delta(ukwa_crawler_log_size_bytes{log='crawl.log'}[1h]) == 0 or absent(ukwa_crawler_log_size_bytes{log='crawl.log'})
for: 1h
labels:
severity: severe
annotations:
summary: "No new crawl logs from tidy-logs"
description: "{{ $labels.instance }} of job {{ $labels.job }} failed to run."

- name: Generic metrics
rules:

# Alert for any instance that is unreachable for >2 minutes.
- alert: service_down
expr: up == 0
for: 5m
for: 2m
labels:
severity: severe
annotations:
Expand All @@ -52,21 +62,13 @@ groups:
description: "CPU rate on {{ $labels.instance }} > 80% for more than 10 minutes"

- alert: host_free_disk_space
expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs|zfs",instance!="jisc03:9100"}/node_filesystem_size_bytes{fstype=~"ext.|xfs|zfs",instance!="jisc03:9100"}) * 100 < 15
expr: (node_filesystem_avail_bytes{fstype=~"ext.|xfs"}/node_filesystem_size_bytes{fstype=~"ext.|xfs"}) * 100 < 15
for: 5m
labels:
severity: severe
annotations:
summary: "Disk space free < 15% on {{ $labels.instance }}"
description: "Disk space free on {{ $labels.instance }} has been less that 15% for more than 5 minutes"
- alert: predict_host_disk_space
expr: min(predict_linear(node_filesystem_free{mountpoint=~"/|/data"}[48h], 2*7*24*3600)) by (instance, mountpoint) < 0
for: 1h
labels:
severity: severe
annotations:
summary: "Disk space running out on {{ $labels.instance }} at {{ $labels.mountpoint }}"
description: "Based on recent sampling, the disk is likely to will fill on volume {{ $labels.mountpoint }} within the next two weeks, for instance: {{ $labels.instance }}."

- alert: cpu_running_too_hot
expr: max(node_hwmon_temp_celsius) by (instance) > 70
Expand Down
16 changes: 8 additions & 8 deletions monitor/aws/prometheus/prometheus.yml-template
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ global:


# Specify the location of rules we want prometheus to load
#rule_files:
# - 'alert.rules.yml'
rule_files:
- 'alert.rules.yml'


# prometheus has a dependency on alertmanager, as defined in docker-compose.yml
#alerting:
# alertmanagers:
# - scheme: http
# static_configs:
# - targets:
# - "${ALERTMANAGER_SERVICE_NAME}"
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- "${ALERTMANAGER_SERVICE_NAME}:9093"


# Define which resources prometheus should monitor
Expand Down
3 changes: 2 additions & 1 deletion monitor/aws/start_monitor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ export ALERTMANAGER_SERVICE_NAME='monitor'
export ALERTMANAGER_DATA=${STORAGE_PATH}/alertmanager
export ALERTMANAGER_PORT=9093
[[ -d ${ALERTMANAGER_DATA}/ ]] || mkdir -p ${ALERTMANAGER_DATA}
envsubst < ./alertmanager/config.yml-template > ./alertmanager/config.yml
sudo chown nobody:nobody ${ALERTMANAGER_DATA}
sudo chmod 775 ${ALERTMANAGER_DATA}
envsubst < ./alertmanager/config.yml-template > ./alertmanager/config.yml


# prometheus
Expand Down

0 comments on commit 75879b0

Please sign in to comment.