forked from network-observability/network-observability-lab
-
Notifications
You must be signed in to change notification settings - Fork 0
/
alerting_rules.yml
56 lines (54 loc) · 2.15 KB
/
alerting_rules.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
---
groups:
# Rule group: operational state of interfaces tagged intf_role="peer".
- name: Peer Interface Down
  rules:
  # Fires once interface_oper_status has compared equal to 2 for a full minute.
  # NOTE(review): 2 is presumably the collector's encoding for "down" (the
  # annotations below describe it that way) -- confirm against the metric source.
  - alert: PeerInterfaceDown
    expr: interface_oper_status{intf_role="peer"} == 2
    for: 1m
    labels:
      # Static labels attached to every alert produced by this rule.
      severity: warning
      source: stack
      environment: network-observability-lab
      metric_name: interface_oper_status
      # Labels copied from the series that triggered the alert.
      device: "{{ $labels.device }}"
      device_role: "{{ $labels.device_role }}"
      site: "{{ $labels.site }}"
      region: "{{ $labels.region }}"
      # The alert's instance label is set from the series' host label.
      instance: "{{ $labels.host }}"
      # Joined as <device_platform>_<net_os>.
      device_platform: "{{ $labels.device_platform }}_{{ $labels.net_os }}"
    annotations:
      summary: "[NET] Device {{ $labels.device }}: Interface Uplink {{ $labels.name }} is down"
      description: "Interface {{ $labels.name }} on device {{ $labels.device }} is down!"
# Rule group: BGP neighbor session state.
- name: BGP Neighbor Down
  rules:
  # Fires once bgp_neighbor_state has compared equal to 2 for a full minute.
  # NOTE(review): 2 is presumably the collector's encoding for a down session
  # (the annotations below describe it that way) -- confirm against the
  # metric source.
  - alert: BGPNeighborDown
    expr: bgp_neighbor_state == 2
    for: 1m
    labels:
      # Static labels attached to every alert produced by this rule.
      severity: critical
      source: stack
      environment: network-observability-lab
      metric_name: bgp_neighbor_state
      # Labels copied from the series that triggered the alert.
      device: "{{ $labels.device }}"
      device_role: "{{ $labels.device_role }}"
      site: "{{ $labels.site }}"
      region: "{{ $labels.region }}"
      # The alert's instance label is set from the series' host label.
      instance: "{{ $labels.host }}"
      # Joined as <device_platform>_<net_os>.
      device_platform: "{{ $labels.device_platform }}_{{ $labels.net_os }}"
    annotations:
      summary: "[NET] Device {{ $labels.device }}: BGP Neighbor {{ $labels.neighbor }}:{{ $labels.neighbor_asn }} is down"
      description: "BGP Neighbor towards {{ $labels.neighbor }}:{{ $labels.neighbor_asn }} on device {{ $labels.device }} is down!"
#### Meta Monitoring
# Rule group: health of the services probed via netobs_health_result_code.
- name: Meta Monitoring
  rules:
  # Fires once netobs_health_result_code has been non-zero for 10 seconds.
  # NOTE(review): a result code of 0 is presumably "healthy" -- confirm
  # against whatever produces this metric.
  - alert: ServiceDown
    expr: netobs_health_result_code != 0
    for: 10s
    labels:
      severity: critical
      environment: network-observability-lab
      # Labels copied from the series that triggered the alert.
      service: "{{ $labels.service }}"
      # The alert's instance label is set from the series' host label.
      instance: "{{ $labels.host }}"
    annotations:
      # Fixed typo in the user-facing summary ("Serivce" -> "Service").
      summary: "[META] Service {{ $labels.service }} is down"
      description: "Service {{ $labels.service }} checked with {{ $labels.host }} is down!"