From ac350172f1b519578307b6a1c6b238cb7d54b903 Mon Sep 17 00:00:00 2001
From: Nuriel Shem-Tov
Date: Thu, 21 Dec 2017 12:53:14 +0000
Subject: [PATCH] Added alertmanager and prometheus alerting rules

---
 group_vars/all/monitoring.yml                 |  11 +
 roles/monitoring/files/alert.rules.yml        | 224 ++++++++++
 roles/monitoring/files/email.tmpl             | 405 +++++++++++++++++
 roles/monitoring/handlers/main.yml            |   6 +
 roles/monitoring/tasks/alertmanager.yml       |  49 +++
 roles/monitoring/tasks/deps.yml               | 119 +++--
 roles/monitoring/tasks/firewall.yml           |   5 +-
 roles/monitoring/tasks/prometheus.yml         |   5 +
 roles/monitoring/tasks/role.yml               |   4 +
 .../templates/alertmanager.cfg.yml.j2         |  56 +++
 .../monitoring/templates/alertmanager.conf.j2 |  21 +
 .../templates/alertmanager.service.j2         |  29 ++
 roles/monitoring/templates/email.tmpl.j2      | 410 ++++++++++++++++++
 roles/monitoring/templates/prometheus.yaml.j2 |  10 +
 14 files changed, 1315 insertions(+), 39 deletions(-)
 create mode 100644 roles/monitoring/files/alert.rules.yml
 create mode 100644 roles/monitoring/files/email.tmpl
 create mode 100644 roles/monitoring/tasks/alertmanager.yml
 create mode 100644 roles/monitoring/templates/alertmanager.cfg.yml.j2
 create mode 100644 roles/monitoring/templates/alertmanager.conf.j2
 create mode 100644 roles/monitoring/templates/alertmanager.service.j2
 create mode 100644 roles/monitoring/templates/email.tmpl.j2

diff --git a/group_vars/all/monitoring.yml b/group_vars/all/monitoring.yml
index f35969a..7391408 100644
--- a/group_vars/all/monitoring.yml
+++ b/group_vars/all/monitoring.yml
@@ -17,3 +17,14 @@ nodesource_version: 8
 iota_prom_exporter_basedir: iota-prom-exporter
 iota_prom_exporter_port: 9311
 iota_prom_exporter_bind: 127.0.0.1
+
+alertmanager_basedir: /opt/prometheus/alertmanager
+alertmanager_version: 0.12.0
+alertmanager_port: 9093
+alertmanager_nginx_port: 9993
+alertmanager_bind: 127.0.0.1
+alertmanager_email_from: alertmanager
+alertmanager_email_to: root@localhost
+alertmanager_loglevel: info
+smtp_host: localhost
+smtp_port: 25
diff --git a/roles/monitoring/files/alert.rules.yml b/roles/monitoring/files/alert.rules.yml
new file mode 100644
index 0000000..3ddcf44
--- /dev/null
+++ b/roles/monitoring/files/alert.rules.yml
@@ -0,0 +1,224 @@
+groups:
+- name: alert.rules
+  rules:
+
+  # Fires when a scrape target stops answering; Prometheus runs on this same host, so expect it rarely.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
+        for more than 1 minute.'
+      summary: Instance {{ $labels.instance }} down
+
+  # CPU usage alerts: busy percentage = 100 - idle rate, averaged per instance (no container "name" label is matched)
+  - alert: NodeCPUUsageWarning
+    expr: (100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (instance)
+      * 100)) > 75
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: CPU usage is above 75% (current value is:
+        {{ $value }})'
+      summary: '{{$labels.instance}}: High CPU usage detected'
+
+  - alert: NodeCPUUsageCritical
+    expr: (100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (instance)
+      * 100)) > 90
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: CPU usage is above 90% (current value is:
+        {{ $value }})'
+      summary: '{{$labels.instance}}: High CPU usage detected'
+
+
+  # Load average alerts: 5-minute load divided by the number of CPUs
+  - alert: NodeLoadAverageWarning
+    expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 4)
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: LA is high'
+      summary: '{{$labels.instance}}: High LA detected'
+
+  - alert: NodeLoadAverageCritical
+    expr: ((node_load5 / count(node_cpu{mode="system"}) WITHOUT (cpu, mode)) > 7)
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: LA is very high'
+      summary: '{{$labels.instance}}: Very high LA detected'
+
+
+  # Low disk space alerts for the root filesystem (assumed to be mounted at "/" on a host install - confirm)
+  - alert: NodeLowRootDiskWarning
+    expr: ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
+      / node_filesystem_size{mountpoint="/"} * 100) > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Root disk usage is above 80% (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Low root disk space'
+
+  - alert: NodeLowRootDiskCritical
+    expr: ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
+      / node_filesystem_size{mountpoint="/"} * 100) > 95
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Root disk usage is above 95% (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Low root disk space'
+
+  # Swap usage alerts: used swap as a percentage of total swap
+  - alert: NodeSwapUsageWarning
+    expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
+      * 100) > 80
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Swap usage is above 80% (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Swap usage detected'
+
+  - alert: NodeSwapUsageCritical
+    expr: (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal)
+      * 100) > 95
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Swap usage is above 95% (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Swap usage detected'
+
+  # Memory usage alerts: total minus free and page cache, as a percentage of total
+  - alert: NodeMemoryUsageWarning
+    expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
+      * 100)) > 80
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Memory usage is above 80% (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: High memory usage detected'
+
+  - alert: NodeMemoryUsageCritical
+    expr: (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal)
+      * 100)) > 95
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Memory usage is above 95% (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: High memory usage detected'
+
+  # Latest milestone alerts (243000 is presumably the value a node reports after a reset - TODO confirm)
+  - alert: LatestMileStoneWarning
+    expr: iota_node_info_latest_milestone == 243000
+    for: 2h
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Latest Milestone Reset (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Latest Milestone reset and stuck'
+
+  - alert: LatestMileStoneCritical
+    expr: iota_node_info_latest_milestone == 243000
+    for: 4h
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Latest Milestone Reset (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Latest Milestone reset and stuck'
+
+  - alert: LatestSubtangleMileStoneBehindWarning
+    expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone)
+      > 5
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Latest Subtangle Milestone lagging'
+
+  - alert: LatestSubtangleMileStoneBehindCritical
+    expr: (iota_node_info_latest_milestone - iota_node_info_latest_subtangle_milestone)
+      > 10
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Latest Subtangle Milestone lagging (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Latest Subtangle Milestone lagging'
+
+  # Neighbor alerts: too few, too many, or inactive (total minus active) neighbors
+  - alert: TotalNeighborsFewWarning
+    expr: iota_node_info_total_neighbors < 2
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Too few neighbors (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Too few neighbors'
+
+  - alert: TotalNeighborsActiveWarning
+    expr: iota_node_info_total_neighbors > 11
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Too many active neighbors (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Too many active neighbors'
+
+  - alert: TotalNeighborsActiveCritical
+    expr: iota_node_info_total_neighbors > 15
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Too many active neighbors (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Too many active neighbors'
+
+  - alert: InactiveNeighborsWarning
+    expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors)
+      > 1
+    for: 1h
+    labels:
+      severity: warning
+    annotations:
+      description: '{{$labels.instance}}: Inactive Neighbors (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Inactive Neighbors'
+
+  - alert: InactiveNeighborsCritical
+    expr: (iota_node_info_total_neighbors - iota_neighbors_active_neighbors)
+      > 3
+    for: 1h
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}}: Inactive Neighbors (current value
+        is: {{ $value }})'
+      summary: '{{$labels.instance}}: Inactive Neighbors'
+
diff --git a/roles/monitoring/files/email.tmpl b/roles/monitoring/files/email.tmpl
new file mode 100644
index 0000000..4f1d648
--- /dev/null
+++ b/roles/monitoring/files/email.tmpl
@@ -0,0 +1,405 @@
+
+
+
+
+
+
+{{ template "__subject" . }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + +
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + + {{ if gt (len .Alerts.Resolved) 0 }} + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+ +
+
+
+
+
+
diff --git a/roles/monitoring/handlers/main.yml b/roles/monitoring/handlers/main.yml
index 3ea28b5..c895eda 100644
--- a/roles/monitoring/handlers/main.yml
+++ b/roles/monitoring/handlers/main.yml
@@ -10,6 +10,12 @@
     state: restarted
     enabled: yes
 
+- name: restart alertmanager
+  systemd:
+    name: alertmanager.service
+    state: restarted
+    enabled: yes
+
 - name: restart iota-prom-exporter
   systemd:
     name: iota-prom-exporter.service
diff --git a/roles/monitoring/tasks/alertmanager.yml b/roles/monitoring/tasks/alertmanager.yml
new file mode 100644
index 0000000..aceda3c
--- /dev/null
+++ b/roles/monitoring/tasks/alertmanager.yml
@@ -0,0 +1,49 @@
+- name: ensure template directory exists for alertmanager
+  file:
+    path: "{{ alertmanager_basedir }}/template"
+    state: directory
+
+- name: copy email template for alertmanager
+  template:
+    src: templates/email.tmpl.j2
+    dest: "{{ alertmanager_basedir }}/template/email.tmpl"
+    force: "{{ overwrite | default('no') }}"
+    backup: yes
+
+- name: copy alertmanager systemd service file
+  template:
+    src: templates/alertmanager.service.j2
+    dest: /etc/systemd/system/alertmanager.service
+  notify:
+    - reload systemd
+
+- name: flush handlers
+  meta: flush_handlers
+
+- name: copy alertmanager config
+  template:
+    force: "{{ overwrite | default('no') }}"
+    backup: yes
+    src: templates/alertmanager.cfg.yml.j2
+    dest: "{{ alertmanager_basedir }}/config.yml"
+    mode: "0600"
+    owner: "{{ prom_username }}"
+    group: "{{ prom_username }}"
+  notify:
+    - restart alertmanager
+
+- name: ensure alertmanager enabled and started
+  systemd:
+    name: alertmanager.service
+    state: started
+    enabled: yes
+
+- name: copy alertmanager nginx config
+  template:
+    src: templates/alertmanager.conf.j2
+    dest: /etc/nginx/conf.d/alertmanager.conf
+    validate: "/usr/local/bin/validate_nginx_config.sh -t %s -d conf.d/alertmanager.conf -r"
+  when: install_nginx is defined and install_nginx
+  notify:
+    - reload nginx
+
diff --git a/roles/monitoring/tasks/deps.yml b/roles/monitoring/tasks/deps.yml
index 3d58d46..b282ceb 100644
--- a/roles/monitoring/tasks/deps.yml
+++ b/roles/monitoring/tasks/deps.yml
@@ -49,6 +49,32 @@
     owner: "{{ prom_username }}"
     group: "{{ prom_username }}"
 
+# Re-download alertmanager?
+- name: stat alertmanager basedir
+  stat:
+    path: "{{ alertmanager_basedir }}"
+  register: alertmanager_basedir_stat
+
+- name: download and unarchive alertmanager release
+  unarchive:
+    src: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz"
+    dest: /tmp
+    remote_src: True
+  when: not alertmanager_basedir_stat.stat.exists
+  register: alertmanager_downloaded
+
+- name: move temporary extracted alertmanager directory to its basedir
+  command: "mv /tmp/alertmanager-{{ alertmanager_version }}.linux-amd64 {{ alertmanager_basedir }}"
+  when: alertmanager_downloaded and alertmanager_downloaded.changed
+
+- name: ensure alertmanager basedir ownership and permissions
+  file:
+    path: "{{ alertmanager_basedir }}"
+    state: directory
+    mode: "0700"
+    owner: "{{ prom_username }}"
+    group: "{{ prom_username }}"
+
 # Re-download node_exporter?
 - name: stat node exporter basedir
   stat:
@@ -75,50 +101,67 @@
     owner: "{{ prom_username }}"
     group: "{{ prom_username }}"
 
-- name: Add an Apt signing key for grafana repo
-  apt_key:
-    url: https://packagecloud.io/gpg.key
-    state: present
-  when: ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu'
-
-- name: add grafana apt repository
-  apt_repository:
-    repo: deb https://packagecloud.io/grafana/stable/debian/ jessie main
-    state: present
-  when: ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu'
-
-- name: Install grafana
-  apt:
-    state: latest
-    name: grafana
-    update_cache: yes
+# Apt installs for Ubuntu
+- block:
+
+  - name: Add an Apt signing key for grafana repo
+    apt_key:
+      url: https://packagecloud.io/gpg.key
+      state: present
+
+  - name: add grafana apt repository
+    apt_repository:
+      repo: deb https://packagecloud.io/grafana/stable/debian/ jessie main
+      state: present
+
+  - name: Install grafana and postfix
+    apt:
+      state: latest
+      name: "{{ item }}"
+      update_cache: yes
+    with_items:
+      - grafana
+      - postfix
+      - mutt
+      - mailutils
+    notify:
+      - reload systemd
   when: ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu'
-  notify:
-    - reload systemd
 
 # Required
-- name: Install epel-release
-  yum: state=latest name=epel-release
-  when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux'
+- block:
+
+  - name: Install epel-release
+    yum: state=latest name=epel-release
+
+  - name: add grafana repository
+    yum_repository:
+      name: grafana
+      description: grafana
+      baseurl: https://packagecloud.io/grafana/stable/el/6/$basearch
+      gpgcheck: yes
+      enabled: yes
+      sslcacert: /etc/pki/tls/certs/ca-bundle.crt
+      repo_gpgcheck: yes
+      gpgkey: 'https://packagecloud.io/gpg.key https://grafanarel.s3.amazonaws.com/RPM-GPG-KEY-grafana'
+
+  - name: Install grafana and postfix
+    yum:
+      name: "{{ item }}"
+      state: latest
+    with_items:
+      - grafana
+      - mutt
+      - postfix
+      - mailx
 
-- name: add grafana repository
-  yum_repository:
-    name: grafana
-    description: grafana
-    baseurl: https://packagecloud.io/grafana/stable/el/6/$basearch
-    gpgcheck: yes
-    enabled: yes
-    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
-    repo_gpgcheck: yes
-    gpgkey: 'https://packagecloud.io/gpg.key https://grafanarel.s3.amazonaws.com/RPM-GPG-KEY-grafana'
-  when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux'
-
-- name: Install grafana
-  yum:
-    name: grafana
-    state: latest
   when: ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux'
 
 - name: flush handlers
   meta: flush_handlers
 
+- name: Ensure postfix is started and enabled
+  systemd:
+    name: postfix
+    state: started
+    enabled: yes
diff --git a/roles/monitoring/tasks/firewall.yml b/roles/monitoring/tasks/firewall.yml
index a43e374..16fa889 100644
--- a/roles/monitoring/tasks/firewall.yml
+++ b/roles/monitoring/tasks/firewall.yml
@@ -14,11 +14,12 @@
   with_items:
     - "{{ prom_nginx_port }}"
     - "{{ grafana_nginx_port }}"
+    - "{{ alertmanager_nginx_port }}"
   when: >
     install_nginx is defined and install_nginx and
     (ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux')
 
-- name: allow prometheus grafana aginx port via selinux seport
+- name: allow prometheus grafana nginx port via selinux seport
   seport:
     ports: "{{ item }}"
     proto: tcp
@@ -27,6 +28,7 @@
   with_items:
     - "{{ prom_nginx_port }}"
     - "{{ grafana_nginx_port }}"
+    - "{{ alertmanager_nginx_port }}"
   when: >
     install_nginx is defined and install_nginx and
     (ansible_distribution == 'CentOS' or ansible_distribution == 'Red Hat Enterprise Linux')
@@ -49,6 +51,7 @@
   with_items:
     - "{{ prom_nginx_port }}"
     - "{{ grafana_nginx_port }}"
+    - "{{ alertmanager_nginx_port }}"
   when: >
     install_nginx is defined and install_nginx and
     (ansible_distribution == 'Debian' or ansible_distribution == 'Ubuntu')
diff --git a/roles/monitoring/tasks/prometheus.yml b/roles/monitoring/tasks/prometheus.yml
index 82777ef..3b7f8b9 100644
--- a/roles/monitoring/tasks/prometheus.yml
+++ b/roles/monitoring/tasks/prometheus.yml
@@ -37,6 +37,13 @@
   notify:
     - restart prometheus
 
+- name: copy alerting rules
+  copy:
+    src: files/alert.rules.yml
+    dest: "{{ prom_configdir }}/alert.rules.yml"
+  notify:
+    - restart prometheus
+
 - name: ensure prometheus enabled and started
   systemd:
     name: prometheus.service
diff --git a/roles/monitoring/tasks/role.yml b/roles/monitoring/tasks/role.yml
index 8b13498..a2589bd 100644
--- a/roles/monitoring/tasks/role.yml
+++ b/roles/monitoring/tasks/role.yml
@@ -10,6 +10,10 @@
   tags:
     - prometheus_config
 
+- import_tasks: alertmanager.yml
+  tags:
+    - alertmanager_config
+
 - import_tasks: node_exporter.yml
   tags:
     - node_exporter_config
diff --git a/roles/monitoring/templates/alertmanager.cfg.yml.j2 b/roles/monitoring/templates/alertmanager.cfg.yml.j2
new file mode 100644
index 0000000..da69e70
--- /dev/null
+++ b/roles/monitoring/templates/alertmanager.cfg.yml.j2
@@ -0,0 +1,58 @@
+---
+global:
+  smtp_smarthost: {{ smtp_host }}:{{ smtp_port }}
+  smtp_from: {{ alertmanager_email_from }}@{{ ansible_hostname }}
+  # For gmail this is required to be true
+  # and for any other ssl encrypted port
+  smtp_require_tls: false
+  pagerduty_url: ''
+  hipchat_api_url: ''
+  opsgenie_api_url: ''
+  wechat_api_url: ''
+  victorops_api_url: ''
+route:
+  group_by: [alertname]
+  repeat_interval: 1h
+  receiver: email-me
+templates:
+  - {{ alertmanager_basedir }}/template/*.tmpl
+receivers:
+
+# Send using postfix local mailer
+# You can send to a gmail or hotmail address
+# but these will most probably be put into junkmail
+# unless you configure your DNS and the from address
+- name: email-me
+  email_configs:
+  - to: {{ alertmanager_email_to }}
+    from: {{ alertmanager_email_from }}@{{ ansible_hostname }}
+    # Quoted so the rendered Go template call is a YAML string, not a flow mapping
+    html: '{% raw %}{{ template "email.tmpl" . }}{% endraw %}'
+    smarthost: {{ smtp_host }}:{{ smtp_port }}
+
+# For gmail, replace the variables/placeholders with your data
+#- name: email-me
+#  email_configs:
+#  - to: $GMAIL_ACCOUNT
+#    from: $GMAIL_ACCOUNT
+#    smarthost: smtp.gmail.com:587
+#    auth_username: "$GMAIL_ACCOUNT"
+#    auth_identity: "$GMAIL_ACCOUNT"
+#    auth_password: "$GMAIL_AUTH_TOKEN"
+
+# For slack notifications, replace the variables/placeholders with your data
+#- name: slack
+#  slack_configs:
+#  - api_url: https://hooks.slack.com/services/XX/XXX/XXXX
+#    channel: "#iota"
+#    send_resolved: true
+#    username: myname
+
+# A firing critical alert mutes the warning alerts raised for the same instance
+inhibit_rules:
+- source_match:
+    severity: critical
+  target_match:
+    severity: warning
+  equal:
+  - instance
diff --git a/roles/monitoring/templates/alertmanager.conf.j2 b/roles/monitoring/templates/alertmanager.conf.j2
new file mode 100644
index 0000000..b9acc02
--- /dev/null
+++ b/roles/monitoring/templates/alertmanager.conf.j2
@@ -0,0 +1,21 @@
+upstream alertmanager {
+  server 127.0.0.1:{{ alertmanager_port }};
+}
+
+server {
+  listen {{ alertmanager_nginx_port }} default_server;
+  server_name _;
+  server_tokens off;
+
+  auth_basic "Restricted";
+  auth_basic_user_file /etc/nginx/.htpasswd;
+
+  location / {
+    proxy_pass http://alertmanager;
+    proxy_http_version 1.1;
+    proxy_set_header Upgrade $http_upgrade;
+    proxy_set_header Connection 'upgrade';
+    proxy_set_header Host $host;
+    proxy_cache_bypass $http_upgrade;
+  }
+}
diff --git a/roles/monitoring/templates/alertmanager.service.j2 b/roles/monitoring/templates/alertmanager.service.j2
new file mode 100644
index 0000000..d8f5d6c
--- /dev/null
+++ b/roles/monitoring/templates/alertmanager.service.j2
@@ -0,0 +1,29 @@
+[Unit]
+Description=Alertmanager for Prometheus
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+WorkingDirectory={{ alertmanager_basedir }}
+Restart=on-failure
+ExecStart={{ alertmanager_basedir }}/alertmanager -config.file config.yml -web.listen-address {{ alertmanager_bind }}:{{ alertmanager_port }} -web.external-url http://{{ ansible_default_ipv4.address }}:{{ alertmanager_nginx_port }} -mesh.listen-address "" -log.level {{ alertmanager_loglevel }}
+Type=simple
+
+# The service has no need to access physical devices under /dev
+PrivateDevices=yes
+
+# Dedicated /tmp
+PrivateTmp=yes
+
+# Make /usr, /boot, /etc read only
+ProtectSystem=full
+
+# /home is not accessible at all
+ProtectHome=yes
+
+# Unprivileged user
+User={{ prom_username }}
+Group={{ prom_username }}
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/monitoring/templates/email.tmpl.j2 b/roles/monitoring/templates/email.tmpl.j2
new file mode 100644
index 0000000..619607f
--- /dev/null
+++ b/roles/monitoring/templates/email.tmpl.j2
@@ -0,0 +1,410 @@
+{% raw %}
+
+
+
+
+
+
+{{ template "__subject" . }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + +
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + +{% endraw %} + {% raw %} + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + + {{ if gt (len .Alerts.Resolved) 0 }} + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} + + View in Grafana +
+ [{{ .Alerts.Firing | len }}] Firing +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+ +
+
+
+
+
+{% endraw %}
diff --git a/roles/monitoring/templates/prometheus.yaml.j2 b/roles/monitoring/templates/prometheus.yaml.j2
index 79945b3..59ec6c3 100644
--- a/roles/monitoring/templates/prometheus.yaml.j2
+++ b/roles/monitoring/templates/prometheus.yaml.j2
@@ -16,3 +16,13 @@ scrape_configs:
     scrape_interval: 5s
     static_configs:
       - targets: ['localhost:{{ iota_prom_exporter_port }}']
+
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets:
+      - "{{ alertmanager_bind }}:{{ alertmanager_port }}"
+
+rule_files:
+  - "{{ prom_configdir }}/alert.rules.yml"