-
Notifications
You must be signed in to change notification settings - Fork 0
/
monitoringnew.sh
195 lines (180 loc) · 6.63 KB
/
monitoringnew.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/bin/bash
# Interactive installer for node_exporter, Prometheus and Alertmanager.
#
# Prompt for the three settings the rest of the script interpolates into the
# generated configuration files:
#   NODE  - node name shown in alert summaries/descriptions
#   IP    - server address used for scrape targets and the Alertmanager URL
#   TOKEN - token appended to the telepush.dev webhook URL
#
# `read -r` keeps backslashes in the input literal. Each value is echoed back
# in `export NAME=value` form; this only prints the line, it does not export.
read -r -p "Enter NODE name:" NODE
printf '%s\n' "export NODE=$NODE"
read -r -p "Enter IP server:" IP
printf '%s\n' "export IP=$IP"
read -r -p "TOKEN telegrambot:" TOKEN
printf '%s\n' "export TOKEN=$TOKEN"

# An empty value would silently produce unusable configs (e.g. a ":9100"
# scrape target), so stop before anything is installed.
if [[ -z "$NODE" || -z "$IP" || -z "$TOKEN" ]]; then
  echo "NODE, IP and TOKEN must all be non-empty" >&2
  exit 1
fi
# Install node_exporter: download the latest release, copy the binary to
# /usr/local/bin, create a dedicated user and register a systemd service.

# Ask the GitHub API for the latest release tag (for example "v1.2.3").
# -f makes curl return nothing on an HTTP error (such as API rate limiting),
# so the emptiness check below catches it instead of running wget without a URL.
node_exporter_tag=$(curl -fsS https://api.github.com/repos/prometheus/node_exporter/releases/latest \
  | awk -F'"' '/"tag_name"/ {print $4; exit}')
if [[ -z "$node_exporter_tag" ]]; then
  echo "Could not determine the latest node_exporter release" >&2
  exit 1
fi

# Release archives are named after the tag without its leading "v".
node_exporter_dir="node_exporter-${node_exporter_tag#v}.linux-amd64"
sudo wget -O "${node_exporter_dir}.tar.gz" \
  "https://github.com/prometheus/node_exporter/releases/download/${node_exporter_tag}/${node_exporter_dir}.tar.gz" || exit 1
sudo tar xvf "${node_exporter_dir}.tar.gz" || exit 1
sudo cp "./${node_exporter_dir}/node_exporter" /usr/local/bin/ || exit 1

# Create the service account only if it is missing, so re-running is harmless.
id -u node_exporter >/dev/null 2>&1 \
  || sudo useradd --no-create-home --shell /usr/sbin/nologin node_exporter

# Remove only what this step downloaded and extracted.
sudo rm -rf -- "./${node_exporter_dir}" "./${node_exporter_dir}.tar.gz"

# Quoted delimiter: the unit file is written verbatim, with no shell expansion.
sudo tee /etc/systemd/system/node_exporter.service > /dev/null <<'EOF'
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl start node_exporter.service
sudo systemctl enable node_exporter.service
# Install Prometheus: download the latest release, install the binaries,
# write prometheus.yml, the alert rules and a systemd unit, then start it.

# Ask the GitHub API for the latest release tag (for example "v1.2.3").
# -f makes curl return nothing on an HTTP error (such as API rate limiting),
# so the emptiness check below catches it instead of running wget without a URL.
prometheus_tag=$(curl -fsS https://api.github.com/repos/prometheus/prometheus/releases/latest \
  | awk -F'"' '/"tag_name"/ {print $4; exit}')
if [[ -z "$prometheus_tag" ]]; then
  echo "Could not determine the latest Prometheus release" >&2
  exit 1
fi

# Release archives are named after the tag without its leading "v".
prometheus_dir="prometheus-${prometheus_tag#v}.linux-amd64"
sudo wget -O "${prometheus_dir}.tar.gz" \
  "https://github.com/prometheus/prometheus/releases/download/${prometheus_tag}/${prometheus_dir}.tar.gz" || exit 1
sudo tar xvf "${prometheus_dir}.tar.gz" || exit 1
sudo cp "./${prometheus_dir}/prometheus" "./${prometheus_dir}/promtool" /usr/local/bin/ || exit 1

# /etc/prometheus must exist BEFORE the console directories are copied:
# `cp -r consoles /etc/prometheus` onto a missing target would create
# /etc/prometheus from the contents of consoles instead of
# /etc/prometheus/consoles, which is the path the unit file below refers to.
sudo mkdir -p /etc/prometheus /var/lib/prometheus

# NOTE(review): newer release archives may no longer ship these directories;
# copy them only when present so the install does not depend on them.
for console_dir in consoles console_libraries; do
  if [[ -d "./${prometheus_dir}/${console_dir}" ]]; then
    sudo cp -r "./${prometheus_dir}/${console_dir}" /etc/prometheus
  fi
done

# Create the service account only if it is missing, so re-running is harmless.
id -u prometheus >/dev/null 2>&1 \
  || sudo useradd --no-create-home --shell /usr/sbin/nologin prometheus

# Remove only what this step downloaded and extracted.
sudo rm -rf -- "./${prometheus_dir}" "./${prometheus_dir}.tar.gz"

# Main configuration. Unquoted delimiter: $IP is substituted at install time.
# The nesting is significant YAML structure and must be preserved.
sudo tee /etc/prometheus/prometheus.yml > /dev/null <<EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s
rule_files:
  - 'rules.yml'
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - $IP:9093
scrape_configs:
  - job_name: "node_exporter"
    scrape_interval: 5s
    static_configs:
      - targets: ["$IP:9100"]
  - job_name: "kusama_node"
    scrape_interval: 5s
    static_configs:
      - targets: ["$IP:9615"]
EOF

# Quoted delimiter: the unit is written verbatim so that \$MAINPID reaches
# systemd (an unquoted heredoc would let the shell expand it to an empty
# string) and the trailing backslashes remain as systemd line continuations.
sudo tee /etc/systemd/system/prometheus.service > /dev/null <<'EOF'
[Unit]
Description=Prometheus Monitoring
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/bin/prometheus \
  --config.file /etc/prometheus/prometheus.yml \
  --storage.tsdb.path /var/lib/prometheus/ \
  --web.console.templates=/etc/prometheus/consoles \
  --web.console.libraries=/etc/prometheus/console_libraries \
  --storage.tsdb.retention.time 30d \
  --web.enable-admin-api
ExecReload=/bin/kill -HUP $MAINPID
[Install]
WantedBy=multi-user.target
EOF

# Alert rules. Unquoted delimiter: $NODE is substituted at install time so the
# node name appears in every summary/description.
sudo tee /etc/prometheus/rules.yml > /dev/null <<EOF
groups:
  - name: alert_rules
    rules:
      - alert: KusamaNodeSyncLag
        expr: (max(substrate_block_height{status="best"}) by (instance) - max(substrate_block_height{status="finalized"}) by (instance)) > 20
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node $NODE lagging behind"
          description: "Node $NODE is lagging more than 20 blocks behind the network."
      - alert: NodeDown
        expr: up{job="kusama_node"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Node $NODE down"
          description: "Node $NODE has been down for more than 1 minute."
      - alert: HighDiskUsage
        expr: (node_filesystem_avail_bytes{job="node_exporter", fstype!="tmpfs", fstype!="sysfs", fstype!="proc"} / node_filesystem_size_bytes{job="node_exporter", fstype!="tmpfs", fstype!="sysfs", fstype!="proc"}) * 100 < 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High disk usage on $NODE"
          description: "Disk usage is above 95% on $NODE."
      - alert: KusamaNodeNotSyncing
        expr: substrate_sub_libp2p_sync_is_major_syncing{job="kusama_node"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node $NODE not syncing"
          description: "Node $NODE is not syncing blocks for more than 5 minutes."
      - alert: KusamaNodeHighCPUUsage
        expr: rate(process_cpu_seconds_total{job="kusama_node"}[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on $NODE"
          description: "CPU usage is above 80% on $NODE for more than 5 minutes."
EOF

# Hand ownership to the service account once every file is in place.
sudo chown -R prometheus:prometheus /etc/prometheus
sudo chown -R prometheus:prometheus /var/lib/prometheus
sudo systemctl daemon-reload
sudo systemctl start prometheus.service
sudo systemctl enable prometheus.service
# Install Alertmanager 0.24.0: download the pinned release into the home
# directory, install the binaries, write the systemd unit and a configuration
# that forwards alerts to the telepush.dev webhook, then (re)start services.
cd ~ || exit 1
alertmanager_dir="alertmanager-0.24.0.linux-amd64"
sudo wget -O "${alertmanager_dir}.tar.gz" \
  "https://github.com/prometheus/alertmanager/releases/download/v0.24.0/${alertmanager_dir}.tar.gz" || exit 1
sudo tar xvf "${alertmanager_dir}.tar.gz" || exit 1
sudo rm -- "${alertmanager_dir}.tar.gz"

# -p keeps a second run from failing on directories that already exist.
sudo mkdir -p /etc/alertmanager /var/lib/prometheus/alertmanager
sudo cp "./${alertmanager_dir}/alertmanager" "./${alertmanager_dir}/amtool" /usr/local/bin/ || exit 1

# The extracted tree is no longer needed once the binaries are installed.
sudo rm -rf -- "./${alertmanager_dir}"

# Create the service account only if it is missing, so re-running is harmless.
id -u alertmanager >/dev/null 2>&1 \
  || sudo useradd --no-create-home --shell /bin/false alertmanager

# Unquoted delimiter: $IP is substituted at install time.
# The service runs as the dedicated alertmanager account rather than root;
# --storage.path points it at the data directory that account owns, because
# the default relative "data/" path would not be writable for it.
sudo tee /etc/systemd/system/alertmanager.service > /dev/null <<EOF
[Unit]
Description=AlertManager Server Service
Wants=network-online.target
After=network-online.target
[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/usr/local/bin/alertmanager --config.file /etc/alertmanager/alertmanager.yml --storage.path=/var/lib/prometheus/alertmanager --web.external-url=http://$IP:9093 --cluster.advertise-address='0.0.0.0:9093'
[Install]
WantedBy=multi-user.target
EOF

# Unquoted delimiter: $TOKEN is substituted into the webhook URL.
# The nesting is significant YAML structure and must be preserved.
sudo tee /etc/alertmanager/alertmanager.yml > /dev/null <<EOF
route:
  group_by: ['alertname', 'instance', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'telepush'
receivers:
  - name: 'telepush'
    webhook_configs:
      - url: 'https://telepush.dev/api/inlets/alertmanager/$TOKEN'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOF

# Hand ownership to the service account once every file is in place.
sudo chown -R alertmanager:alertmanager /etc/alertmanager /var/lib/prometheus/alertmanager
sudo chown alertmanager:alertmanager /usr/local/bin/{alertmanager,amtool}

sudo systemctl daemon-reload
sudo systemctl enable alertmanager
# restart (rather than start) so a re-run picks up the regenerated unit and
# configuration; Prometheus is restarted as well, as in the original flow.
sudo systemctl restart prometheus.service
sudo systemctl restart alertmanager.service