-
Notifications
You must be signed in to change notification settings - Fork 165
/
localnet-sanity.sh
executable file
·382 lines (332 loc) · 9.2 KB
/
localnet-sanity.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env bash
set -e
skipSetup=false
iterations=1
restartInterval=never
rollingRestart=false
extraNodes=0
walletRpcPort=:8899
usage() {
exitcode=0
if [[ -n "$1" ]]; then
exitcode=1
echo "Error: $*"
fi
cat <<EOF
usage: $0 [options...]
Start a local cluster and run sanity on it
options:
-i [number] - Number of times to run sanity (default: $iterations)
-k [number] - Restart the cluster after this number of sanity iterations (default: $restartInterval)
-R - Restart the cluster by incrementially stopping and restarting
nodes (at the cadence specified by -k). When disabled all
nodes will be first killed then restarted (default: $rollingRestart)
-b - Disable leader rotation
-x - Add an extra validator (may be supplied multiple times)
-r - Select the RPC endpoint hosted by a node that starts as
a validator node. If unspecified the RPC endpoint hosted by
the bootstrap validator will be used.
-c - Reuse existing node/ledger configuration from a previous sanity
run
EOF
exit $exitcode
}
cd "$(dirname "$0")"/..
while getopts "ch?i:k:brxR" opt; do
case $opt in
h | \?)
usage
;;
c)
skipSetup=true
;;
i)
iterations=$OPTARG
;;
k)
restartInterval=$OPTARG
;;
x)
extraNodes=$((extraNodes + 1))
;;
r)
walletRpcPort=":18899"
;;
R)
rollingRestart=true
;;
*)
usage "Error: unhandled option: $opt"
;;
esac
done
source ci/upload-ci-artifact.sh
source scripts/configure-metrics.sh
source multinode-demo/common.sh --prebuild
nodes=(
"multinode-demo/bootstrap-validator.sh \
--no-restart \
--init-complete-file init-complete-node0.log \
--dynamic-port-range 8000-8050"
"multinode-demo/validator.sh \
--no-restart \
--dynamic-port-range 8050-8100
--init-complete-file init-complete-node1.log \
--rpc-port 18899"
)
if [[ extraNodes -gt 0 ]]; then
for i in $(seq 1 $extraNodes); do
portStart=$((8100 + i * 50))
portEnd=$((portStart + 49))
nodes+=(
"multinode-demo/validator.sh \
--no-restart \
--dynamic-port-range $portStart-$portEnd
--label dyn$i \
--init-complete-file init-complete-node$((1 + i)).log"
)
done
fi
numNodes=$((2 + extraNodes))
pids=()
logs=()
getNodeLogFile() {
declare nodeIndex=$1
declare cmd=$2
declare baseCmd
baseCmd=$(basename "${cmd// */}" .sh)
echo "log-$baseCmd-$nodeIndex.txt"
}
startNode() {
declare nodeIndex=$1
declare cmd=$2
echo "--- Start $cmd"
declare log
log=$(getNodeLogFile "$nodeIndex" "$cmd")
rm -f "$log"
$cmd > "$log" 2>&1 &
declare pid=$!
pids+=("$pid")
echo "pid: $pid"
echo "log: $log"
}
waitForNodeToInit() {
declare initCompleteFile=$1
while [[ ! -r $initCompleteFile ]]; do
if [[ $SECONDS -ge 300 ]]; then
echo "^^^ +++"
echo "Error: $initCompleteFile not found in $SECONDS seconds"
exit 1
fi
echo "Waiting for $initCompleteFile ($SECONDS)..."
sleep 2
done
echo "Found $initCompleteFile"
}
initCompleteFiles=()
waitForAllNodesToInit() {
echo "--- ${#initCompleteFiles[@]} nodes booting"
SECONDS=
for initCompleteFile in "${initCompleteFiles[@]}"; do
waitForNodeToInit "$initCompleteFile"
done
echo "All nodes finished booting in $SECONDS seconds"
}
startNodes() {
declare addLogs=false
if [[ ${#logs[@]} -eq 0 ]]; then
addLogs=true
fi
initCompleteFiles=()
maybeExpectedGenesisHash=
for i in $(seq 0 $((${#nodes[@]} - 1))); do
declare cmd=${nodes[$i]}
declare initCompleteFile="init-complete-node$i.log"
rm -f "$initCompleteFile"
initCompleteFiles+=("$initCompleteFile")
startNode "$i" "$cmd $maybeExpectedGenesisHash"
if $addLogs; then
logs+=("$(getNodeLogFile "$i" "$cmd")")
fi
# 1 == bootstrap validator, wait until it boots before starting
# other validators
if [[ "$i" -eq 1 ]]; then
SECONDS=
waitForNodeToInit "$initCompleteFile"
(
set -x
$solana_cli --keypair config/bootstrap-validator/identity.json \
--url http://127.0.0.1:8899 genesis-hash
) | tee genesis-hash.log
maybeExpectedGenesisHash="--expected-genesis-hash $(tail -n1 genesis-hash.log)"
fi
done
waitForAllNodesToInit
}
killNode() {
declare pid=$1
set +e
if kill "$pid"; then
echo "Waiting for $pid to exit..."
wait "$pid"
echo "$pid exited with $?"
fi
set -e
}
killNodes() {
[[ ${#pids[@]} -gt 0 ]] || return
# Try to use the RPC exit API to cleanly exit the first two nodes
# (dynamic nodes, -x, are just killed)
echo "--- RPC exit"
$agave_validator --ledger "$SOLANA_CONFIG_DIR"/bootstrap-validator exit --force || true
$agave_validator --ledger "$SOLANA_CONFIG_DIR"/validator exit --force || true
# Give the nodes a splash of time to cleanly exit before killing them
sleep 2
echo "--- Killing nodes: ${pids[*]}"
for pid in "${pids[@]}"; do
killNode "$pid"
done
echo "done killing nodes"
pids=()
}
rollingNodeRestart() {
if [[ ${#logs[@]} -ne ${#nodes[@]} ]]; then
echo "^^^ +++"
echo "Error: log/nodes array length mismatch"
exit 1
fi
if [[ ${#pids[@]} -ne ${#nodes[@]} ]]; then
echo "^^^ +++"
echo "Error: pids/nodes array length mismatch"
exit 1
fi
declare oldPids=("${pids[@]}")
for i in $(seq 0 $((${#logs[@]} - 1))); do
declare pid=${oldPids[$i]}
declare cmd=${nodes[$i]}
if [[ $i -eq 0 ]]; then
# First cmd should be the faucet, don't restart it.
[[ "$cmd" = "multinode-demo/faucet.sh" ]]
pids+=("$pid")
else
echo "--- Restarting $pid: $cmd"
killNode "$pid"
# Delay 20 seconds to ensure the remaining cluster nodes will
# hit CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS (currently 15 seconds) for the
# node that was just stopped
echo "(sleeping for 20 seconds)"
sleep 20
declare initCompleteFile="init-complete-node$i.log"
rm -f "$initCompleteFile"
initCompleteFiles+=("$initCompleteFile")
startNode "$i" "$cmd"
fi
done
# 'Atomically' remove the old pids from the pids array
declare oldPidsList
oldPidsList="$(printf ":%s" "${oldPids[@]}"):"
declare newPids=("${pids[0]}") # 0 = faucet pid
for pid in "${pids[@]}"; do
[[ $oldPidsList =~ :$pid: ]] || {
newPids+=("$pid")
}
done
pids=("${newPids[@]}")
waitForAllNodesToInit
}
verifyLedger() {
for ledger in bootstrap-validator validator; do
echo "--- $ledger ledger verification"
(
set -x
$solana_ledger_tool --ledger "$SOLANA_CONFIG_DIR"/$ledger verify
) || flag_error
done
}
shutdown() {
exitcode=$?
killNodes
set +e
echo "--- Upload artifacts"
for log in "${logs[@]}"; do
upload-ci-artifact "$log"
tail "$log"
done
exit $exitcode
}
trap shutdown EXIT INT
set -e
declare iteration=1
flag_error() {
echo "Failed (iteration: $iteration/$iterations)"
echo "^^^ +++"
exit 1
}
if ! $skipSetup; then
clear_config_dir "$SOLANA_CONFIG_DIR"
multinode-demo/setup.sh --hashes-per-tick sleep
else
verifyLedger
fi
startNodes
lastTransactionCount=
while [[ $iteration -le $iterations ]]; do
echo "--- Node count ($iteration)"
(
set -x
client_keypair=/tmp/client-id.json-$$
$solana_keygen new --no-passphrase -fso $client_keypair || exit $?
$solana_gossip --allow-private-addr spy -n 127.0.0.1:8001 --num-nodes-exactly $numNodes || exit $?
rm -rf $client_keypair
) || flag_error
echo "--- RPC API: bootstrap-validator getTransactionCount ($iteration)"
(
set -x
curl --retry 5 --retry-delay 2 --retry-connrefused \
-X POST -H 'Content-Type: application/json' \
-d '{"jsonrpc":"2.0","id":1, "method":"getTransactionCount"}' \
-o log-transactionCount.txt \
http://localhost:8899
cat log-transactionCount.txt
) || flag_error
echo "--- RPC API: validator getTransactionCount ($iteration)"
(
set -x
curl --retry 5 --retry-delay 2 --retry-connrefused \
-X POST -H 'Content-Type: application/json' \
-d '{"jsonrpc":"2.0","id":1, "method":"getTransactionCount"}' \
http://localhost:18899
) || flag_error
# Verify transaction count as reported by the bootstrap-validator node is advancing
transactionCount=$(sed -e 's/{"jsonrpc":"2.0","result":\([0-9]*\),"id":1}/\1/' log-transactionCount.txt)
if [[ -n $lastTransactionCount ]]; then
echo "--- Transaction count check: $lastTransactionCount < $transactionCount"
if [[ $lastTransactionCount -ge $transactionCount ]]; then
echo "Error: Transaction count is not advancing"
echo "* lastTransactionCount: $lastTransactionCount"
echo "* transactionCount: $transactionCount"
flag_error
fi
fi
lastTransactionCount=$transactionCount
echo "--- Wallet sanity ($iteration)"
(
set -x
timeout 60s scripts/wallet-sanity.sh --url http://127.0.0.1"$walletRpcPort"
) || flag_error
iteration=$((iteration + 1))
if [[ $restartInterval != never && $((iteration % restartInterval)) -eq 0 ]]; then
if $rollingRestart; then
rollingNodeRestart
else
killNodes
verifyLedger
startNodes
fi
fi
done
killNodes
verifyLedger
echo +++
echo "Ok ($iterations iterations)"
exit 0