###############################################################################
# #
# Alteeve's Niche! Striker Configuration #
# #
# This is the global configuration for the Striker dashboard and ScanCore #
# alert system. Once configured on a dashboard, you can copy it as-is to your #
# Anvil! nodes. #
# #
###############################################################################
# Normally, Striker dashboards don't send alerts. If you want to send alerts,
# you can do so by "piggy-backing" on an existing mail server. Enter the domain
# name set in 'Outgoing Mail Server' in the 'Mail Servers' section.
# This is messy for legacy reasons, but will be made a lot cleaner in v3. Sorry
# about that...
#striker::email::use_server = mail.example.com
# To choose who gets alerts, and what level to use, you can enter the email
# address set as the 'Target' in one or more notification targets. To specify
# the level of alerts to receive, suffix the email address with
# '<email>:<level>'. If no level is specified, 'warning' is used. The valid
# level strings are 'debug', 'info', 'notice', 'warning' and 'critical'.
# Multiple recipients can be specified by separating them with commas.
#striker::email::notify = user1@example.com:warning,user2@example.com:notice
# By default, a server rebooting is treated as a notice-level alert. If you
# want to make reboots or restarts a warning level alert, set this to '1'.
#scancore::servers::reboots_are_warning = 0
### Common - These values apply to all utilities.
# If you create a custom skin, uncomment this and change the value to the name
# of the directory your skin is stored in under /var/www/html/skins/
#sys::skin = alteeve
# This controls how many seconds to wait before reloading the main page and
# status pages. Set to '0' to never refresh the page. Yes, this is because we
# suck at AJAX... Patches welcomed. ;)
#sys::reload_page_timer = 0
# When multiple database writes are needed, they will be batched to increase
# performance. However, this consumes more memory. By default, the database
# method will break up any batch of database writes larger than 25,000 calls at
# a time. If your system has less than normal memory, this can be reduced. If
# your node or dashboard has more than average memory, this can be increased to
# reduce resync time.
#sys::db::maximum_batch_size = 25000
# Periodically, we will check for an Internet connection by trying to ping an
# IP address on the Internet. The default is to ping Google's open DNS server
# at '8.8.8.8'. If you want to change this to use a different IP address, you
# can do so here.
#sys::network::internet_test_ip = 8.8.8.8
# Some commands are particularly risky, and to help avoid them being called by
# accident if a confirmed page is reloaded, a timeout is set. If the current
# time is more than this number of seconds after the request time, the command
# is aborted.
#sys::expire_timeout = 180
###############################################################################
# Install Manifests and Targets #
###############################################################################
# By default, if a node has internet access, it will be updated during the
# installation process. If you are in an environment where updates must be
# tested and wish to stick with the versions of programs on the source, set
# this to '0' to disable the install-time update attempt.
#sys::install_manifest::default::update_os = 1
# By default, the install manifest uses the standard MTU of 1500 for all
# networks. This is not (yet) configurable in the Install Manifest menu. If you
# know your network will properly work with a higher MTU, uncomment this and
# set your desired MTU.
#
# NOTE: If the MTU is too large, the install manifest run will hang when it
# tries to start DRBD.
#
#sys::install_manifest::default::mtu_size = 1500
# In rare cases, you may wish to force DRBD into UpToDate/UpToDate state
# without an initial sync. Generally this is not a good idea because DRBD's
# backing devices won't be identical until data is written to them via DRBD. So
# a verify call will find a lot of inconsistencies should it be run. However,
# if you have a large storage pool and data is rapidly written to it, you may
# wish to enable this.
#
# In short, this calls 'drbdadm -- --clear-bitmap new-current-uuid <res>'
# when first building the DRBD resource.
#sys::install_manifest::default::immediate-uptodate = 0
# By default, enabling the 'Install Target' feature generates a warning about
# possible DHCP conflicts when the BCN is not isolated from the IFN. If you
# know this isn't a concern, you can prevent these warnings by setting this to
# '1'.
sys::install_target::no_warning = 0
##############################################################################
# System logging #
##############################################################################
# This sets the logging of the Striker dashboard.
sys::log_level = 1
# If you want to see each process's PID in the logs, set this to '1'.
sys::log::log_pid = 0
###############################################################################
# Tools control #
###############################################################################
# This section is used to control the behaviour of various Striker and Anvil!
# tools.
# If the disaster recovery function is used, and if drive caching is enabled,
# DR jobs will look for a drive on either node as well as check for a drive on
# the DR target machine. To avoid using the wrong drive, all drives will be
# checked for a signature file. If a drive does NOT have a signature file, it
# will NOT be used for DR caching purposes. By default, this signature file is
# '.dr_cache' (note the leading '.'). If you wish to use a different signature,
# you can specify the signature here.
#tools::disaster_recovery::cache_signature = .dr_cache
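#
# For example, assuming the signature file is expected at the root of the
# drive's filesystem and the drive is mounted at a hypothetical '/mnt/dr_cache'
# mount point, the drive could be marked as usable for DR caching with:
#
# touch /mnt/dr_cache/.dr_cache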
# Setting this to '1' will enable 'anvil-kick-apc-ups', which is a tool that
# uses the Anvil!'s APC UPSes as a form of system-wide watchdog timer. When
# enabled, the system will tell the UPS to power off after a set amount of time
# and then turn back on after another set amount of time. During normal
# operation, the shutdown will be cancelled and the timer reset repeatedly.
# This way, the UPSes never shut off unless something happens which causes all
# machines running 'anvil-kick-apc-ups' to fail to reset the timer, such as a
# switch stack failure that knocks out all networking. The goal is to force a
# hard reset of the entire rack in such a case, in the hopes that the power-
# cycle will restore normal operation.
#
# This can be enabled and disabled using:
# /sbin/striker/anvil-kick-apc-ups --{enable,disable}
#
# The current state can be checked with '--status'.
#
# WARNING: Be sure to affix a warning on the UPS outlets indicating that the
# outlets may become energized automatically if this is enabled.
# Failure to warn maintenance personnel may result in injury or death!
#
# NOTE: This is default-off (0) on dashboards. It will be set to '1' on nodes
# when 'sys::install_manifest::use_hap-kick-apc-ups' is set to '1' at the time
# an install manifest is generated. On CCRS, this is always set to '1'.
#
tools::anvil-kick-apc-ups::enabled = 0
# If you enable 'anvil-kick-apc-ups', which turns the UPSes into a rack-wide
# watchdog timer, you can control the shutdown timer, restart timer and the
# frequency that the UPSes are "kicked" using these variables.
#
# This controls how far in the future to tell the UPS to shut off. The default
# is '600' (ten minutes).
#sys::apc::ups::power_off_delay = 600
#
# If the timer runs out and the UPS shuts down, this controls how long the UPS
# "sleeps" for before turning back on. The default is '300' (5 minutes).
# NOTE: Values under 300 seem to cause display issues on some SMT1500-model
# UPSes.
#sys::apc::ups::sleep_time = 300
#
# The kick script is started once a minute. If you want to reduce the frequency
# that the UPSes are actually kicked, set this value to how many seconds you
# want to wait between kicks. We stick with 'seconds' for consistency with
# other variables, but it will be rounded to minutes. For example, to perform
# the actual kick once every five minutes (one in five runs), set this to
# '300'.
#sys::apc::ups::kick_frequency = 300
# If you have enabled this feature, you will have an additional pair of
# commands shown in the Striker UI called "Power Cycle System" and "Power Off
# System". The former performs a "Cold Stop", then tells the UPSes to power
# off in X seconds, stay off for Y seconds, and then restore power. This is a
# useful mechanism for a total restart of an Anvil! system (and anything else
# powered by the system) for remote deployments.
#
# By default, the shutdown delay (X) is 60 seconds and the sleep time
# (Y) is also 60 seconds. You may find that your system takes longer than that
# to shut down, or you may want to tune how long the UPSes stay off for. These
# variables allow you to control these values.
sys::apc::reboot::power_off_delay = 60
sys::apc::reboot::sleep_time = 60
# When a "Power Cycle" or "Power Off" is requested, the nodes have their
# "stop reason" set to a time stamp in the future. This tells ScanCore that the
# nodes are to be left off until the timestamp expires. This gives the system
# time to shut down the UPSes. The default is '300' (five minutes), but
# depending on your environment, this may not be long enough. You will know
# this when, during testing, one or both of the nodes start to boot before the
# UPSes lose power. In this case, increasing this delay will help.
#sys::power_off_delay = 300
# The 'anvil-safe-start' tool, when enabled, will run when a node powers on.
# It will perform various sanity checks, including trying to connect to the
# peer node. If it can reach the peer, it will join the nodes to the cluster
# stack and then boot servers on the preferred host if possible, or on the
# healthiest node when not.
#
# Set this to '1' to have the Anvil! start (or the node join) on boot. Set to
# '0' to disable this feature. If disabled, a node will not join an Anvil!
# automatically. This can be useful when performing maintenance on a node.
#
# NOTE: Servers set to 'Do Not Start' in Striker, or that were gracefully shut
# off in Striker, will stay off. Servers stopped as part of a Cold Shutdown and
# any servers running after a total loss of power or an emergency shutdown will
# start.
#tools::anvil-safe-start::enabled = 1
# During anvil-safe-start, when it first connects the replicated storage layer,
# it checks to see if one of the nodes is 'Inconsistent'. If it is, it checks
# to see how long the sync will take at the default (adaptive) sync rate. If it
# is greater than the 'max_wait_time', it will boost the resync speed and then
# wait 'resync_delay' seconds (default is 15). Then it will check the
# estimated time to reach full resync.
#
# If it is less than 'max_wait_time', it will continue to wait, watching the
# resync process. Once it reaches UpToDate/UpToDate on both nodes, it will
# restore the default resync speed and proceed with the boot process.
#
# If the estimated resync time, after boosting, is higher than 'max_wait_time',
# the resync speed will be reset to default and the boot process will proceed
# with the Inconsistent node being treated as degraded (meaning servers will
# likely not boot on it). If you want to "boost and hold" so that the node does
# not appear online until it has resync'ed, set this to 'wait' and set
# 'always_boost' to '1'. This tells the system to always boost the sync speed
# on startup, regardless of the estimated time to sync on the default sync
# speed.
#
# By default, the boosted speed is 80 MB/sec and the maximum wait time is five
# minutes (300 seconds). If you wish to tune these values, you can do so here.
# Please use integers only!
#
#anvil-safe-start::drbd::always_boost = 0
#anvil-safe-start::drbd::boost_speed = 80
#anvil-safe-start::drbd::max_wait_time = 300
#anvil-safe-start::drbd::resync_delay = 15
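#
# For example, to "boost and hold" as described above, so that a node is not
# treated as online until its resync has completed, you would set:
#
#anvil-safe-start::drbd::always_boost = 1
#anvil-safe-start::drbd::max_wait_time = wait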
# If you plan to use Virtual Machine Manager with a GUI install on your Striker
# dashboards, setting this option to '1' will add the Striker user's public RSA
# keys on each managed Anvil!. This will prevent the need for the user to
# repeatedly enter each node's root password when trying to connect to a
# server.
#
# NOTE: Consider the security implications of this! If this is enabled, anyone
# who has access to the dashboard will have unfettered access to the
# Anvil! nodes. Enable this feature only in environments where physical
# access is controlled.
#
tools::striker-push-ssh::enabled = 1
# If 'tools::striker-push-ssh::enabled' is enabled, it will check to see if a
# node's SSH fingerprint has changed, as would happen when a node is replaced.
# If the key has changed, it will delete the old key and record the new one,
# allowing virtual machine manager to still work properly.
#
# As above, this opens up a security risk.
#
# Turning this off will mean that you will have to manually delete old SSH
# fingerprints from the dashboard's ~/.ssh/known_hosts file and then add the
# new key with 'ssh-copy-id root@<full_node_name>' as the desktop user.
tools::auto-update-ssh-fingerprints::enabled = 1
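#
# For example, if this feature is disabled and a node is replaced, one way to
# clear the old fingerprint and add the new key manually (hypothetical host
# name shown) is:
#
# ssh-keygen -R an-a05n01.alteeve.com
# ssh-copy-id root@an-a05n01.alteeve.com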
# Set this to '1' to have Striker automatically configure Virtual Machine
# Manager when new Anvil! systems are added to Striker. Note that this has no
# effect unless 'virt-manager' is installed.
tools::striker-configure-vmm::enabled = 1
# By default, Striker will sync with its peer when its configuration changes
# via the web interface. This works by looking at the 'scancore::db::X::host'
# entries, matching the local machine with one entry and selecting the other
# as the peer node. The database password needs to be the same as the root
# password for this to work. To disable this automatic sync process, set this
# to '0'.
tools::striker::auto-sync = 1
### NOTE: This is quite weak security... an attacker with root access can
### disable this function and avoid the need for a passphrase entirely.
### We could add a check of the passphrase against the controller,
### but that would be no better. In the end, the commands sent to the
### controller are not secured so any attempt at security here is little
### more than a delay tactic against a malicious attacker.
###
### The primary purpose of this tool is to render the data forensically
### unrecoverable by a trusted user of the system. With this option
### available, it is *critical* that you trust whomever has access to
### your Anvil!. If the system is critical enough to use this feature, it
### should really be air-gapped from any public network.
#
# This sets a minimum password length. Default is '6'.
tools::anvil-self-destruct::minimum_length = 6
# If your Anvil! uses SafeStore and self-encrypted drives, you can enable the
# self-destruct function by setting a passphrase below.
#
# Generate the hash by running:
# - /sbin/dashboard/anvil-generate-passphrase <secret>
# Copy the value in '- Hash: [...]' and save it below.
# The default passphrase is 'secret'.
#
# **************************************
# *** CHANGE THIS BEFORE PRODUCTION! ***
# **************************************
#
tools::anvil-self-destruct::hash = vSsar3708Jvp9Szi2NWZZ02Bqp1qRCFpbcTZPdBhnWgs5WtNZKnvCXdhztmeD2cmW192CF5bDufKRpayrW/isg
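#
# For example, to generate a new hash (the passphrase shown is illustrative;
# use your own):
#
# /sbin/dashboard/anvil-generate-passphrase 'MyNewPassphrase'
#
# Then copy the value from the '- Hash: [...]' line into the
# 'tools::anvil-self-destruct::hash' variable above.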
##############################################################################
# ScanCore configuration variables #
##############################################################################
# This is the master switch for ScanCore. If this is set to '0', ScanCore will
# immediately exit, if run.
#
# This can be enabled and disabled using:
# /sbin/striker/ScanCore/ScanCore --{enable,disable}
scancore::enabled = 1
# ScanCore always checks to see how much RAM it is using at the end of a given
# scan sweep. This isn't an exact number as it includes all shared libraries.
# This is done to catch memory leaks so that ScanCore can exit (suicide)
# before they become a problem.
# The default is to allow 1 GiB of RAM to be used. However, on some systems
# with a very large number of sensors/scan agent data, a DB resync could cause
# the RAM use to climb higher than this. If you find that ScanCore restarts
# when a dashboard restarts, then you can increase this value. Please note that
# the value is expressed in bytes.
#scancore::maximum_ram = 1073741824
# To help protect against a scan agent hanging, we give each agent a maximum
# amount of run time before we consider it hung and terminate it. By default,
# this is 300 seconds (5 minutes). Generally you do not want to go less than
# this because, after prolonged outages of a dashboard, a resync could take a
# while to run. If you find that agents are being terminated too often
# when they were not actually hung, increase this. Be aware that you will want
# to keep this below 'scancore::update_age_limit'.
#scancore::agent_timeout = 300
# ScanCore will periodically get started by cron (default is once per minute).
# When it starts, it checks to see if another copy is already running. If not,
# it starts. If so, it looks to see how long ago the running copy completed a
# scan. If the last scan was greater than this number of seconds ago, it
# assumes that it hung and terminates it.
scancore::update_age_limit = 1200
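#
# For illustration only, the once-per-minute start described above corresponds
# to a cron entry along these lines (the actual entry is installed by the
# Anvil! tools and its exact form may differ):
#
# * * * * * root /sbin/striker/ScanCore/ScanCore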
# By default, ScanCore will do a sweep once every minute. This means that all
# agents will be invoked once per minute, and then ScanCore will sleep for 60
# seconds. If you want to increase the scan frequency, reduce this delay. Note,
# however, that it will generate more data which, in turn, will grow the
# ScanCore database faster and increase the overall load. On the other hand, you
# can increase the sleep time to reduce the rate at which data grows, but in
# this case, you increase the chance of missing an important event that comes
# and goes between sweeps.
#
# NOTE: If you increase this, please consider increasing
# 'scancore::power::load_shed_delay' and
# 'scancore::temperature::load_shed_delay' as well because they determine
# their time based on historical scan data, which becomes less reliable
# when the load_shed times are short and the sleep times are high.
#scancore::sleep_time = 60
# To keep ScanCore's database growth in check, an auto-archive mechanism is
# used by some agents where, at the end of each scan, the number of records in
# the history schema for a given table are counted (restricted to the agent's
# host, when appropriate).
#
# When the number exceeds the trigger, the number of records that are archived
# is approximately (number of records above trigger + 'count'). This is not an
# exact number because a representative timestamp will be selected from the
# history schema at this count, and then any record equal to or older than the
# time stamp is removed.
#
# To protect against the potential of using too much disk space, archives are
# off by default. Under normal behaviour, old records are simply removed. To
# enable the archive function, set this to '1'.
#scancore::archive::save_to_disk = 1
#
# When archiving to disk is enabled, to protect against large memory use or
# long archive times in the case where the number of records to archive is
# particularly large, the 'division' value is used to break up the archive job
# into "chunks". Generally speaking, the division should be greater than the
# count, and never be needed. However, if the archive process takes too long,
# or if the archive was triggered well above the trigger value, the division
# can help prevent using too much memory at once. If division is set to '0',
# archive jobs will never be divided.
#
# The archives are all stored in the specified
# directory using the name format '<agent>.<table>.<timestamp>.bz2' and the
# archives are synced between dashboards for safe keeping. Archive files are
# never removed automatically.
#
# To disable auto-archiving entirely, set 'trigger' to '0'.
#
# NOTE: If the archive directory doesn't exist, ScanCore will create it
# automatically the first time it is needed.
scancore::archive::directory = /var/ScanCore/archives/
scancore::archive::trigger = 10000
scancore::archive::count = 5000
scancore::archive::division = 50000
# Database connection variables.
#
# Hostname or IP of the Striker dashboard with the database.
#scancore::db::X::host
#
# TCP port used to connect to the postgres server on the host.
#scancore::db::X::port
#
# ScanCore database name
#scancore::db::X::name
#
# ScanCore database user
#scancore::db::X::user
#
# ScanCore database password
#scancore::db::X::password
#
# By default, Striker will try to ping the DB host before trying to connect to
# the database. If the target is unavailable, this will speed up connections in
# a degraded state. If, however, the given database host can not be pinged
# directly, set this to '0' and ping checks will be skipped.
#scancore::db::X::ping_before_connect = 0
#
# While we wait for the Spice HTML5 client to mature, Striker will
# automatically set up passwordless SSH from the admin user of Striker
# dashboards to Anvil! nodes. These peers are generally the same as the
# machines hosting ScanCore databases, so these variables serve a
# double purpose of telling a Striker dashboard who its peers are. When an
# Anvil! is added or edited, by default, the 'striker-push-ssh' and the
# 'striker-configure-vmm' tools are run on all peers. If you need to specify a
# non-standard port for SSH to a peer, you can use:
#scancore::db::X::ssh_port
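#
# For example, to use a non-standard SSH port (hypothetical port number) for
# peer 1:
#scancore::db::1::ssh_port = 2222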
#
# You can prevent this behaviour on a per-peer basis with:
#scancore::db::X::no_sync = 1
#
# You can globally disable this feature with:
#striker::peers::configure_anvils = 0
#
#scancore::db::1::host = an-striker01.alteeve.com
#scancore::db::1::port = 5432
#scancore::db::1::name = scancore
#scancore::db::1::user = admin
#scancore::db::1::password = Initial1
#scancore::db::1::ping_before_connect = 1
#
#scancore::db::2::host = an-striker02.alteeve.com
#scancore::db::2::port = 5432
#scancore::db::2::name = scancore
#scancore::db::2::user = admin
#scancore::db::2::password = Initial1
#scancore::db::2::ping_before_connect = 1
# ScanCore can trigger a shutdown of a node for two reasons; Overheating and
# insufficient remaining runtime in batteries. The former protects your nodes
# from potential damage. Both provide a mechanism for gracefully shutting down
# the hosted virtual machines and cleanly powering down the nodes
# *before* they would shut down anyway from loss of power or hardware-based
# over-temperature shutdown.
#
# Obviously, this means that ScanCore can potentially screw up and take the
# nodes offline if there was a bug. We don't pretend to be perfect.
#
# If you want to disable this automatic shutdown feature, you can do so by
# setting these variables to '1'.
#scancore::disable::power_shutdown = 0
#scancore::disable::thermal_shutdown = 0
# Normally, if one node becomes more healthy than the peer, it will migrate any
# servers on the peer to itself. If you want to disable this behaviour, set this
# to '1'.
#scancore::disable::preventative_migration = 0
# Setting this to '1' will disable ScanCore's automatic booting of nodes that are
# off. When enabled, nodes will be booted provided the power and temperature are
# good and that the nodes were not cleanly stopped via Striker's WebUI.
scancore::disable::boot_nodes = 0
# By default, ScanCore running on the nodes will automatically shed load in two
# cases;
# 1. If both/all UPSes feeding it lose mains for a period of time. This is done
# to maximize battery runtime by reducing the power draw on the UPSes.
# 2. If both nodes have entered a thermal warning state for a period of time.
# This is done to reduce the amount of heat being generated to slow down the
# speed that a server room/data center heats up after losing cooling.
# This behaviour can be disabled by setting this to '1'.
#
# NOTE: Agent developers - If you update the power table, check to see if this
# is set. If it is, only update 'power' as needed. If it is NOT disabled,
# update 'power' on every pass so that ScanCore can tell when power was
# lost and honour the 'scancore::power::load_shed_delay' properly.
scancore::disable::load_shedding = 0
# When both nodes lose power, load shedding (when enabled) is delayed for a
# period of time so that load shedding doesn't occur for short, transient power
# failures. For example, shedding load when you're moving the rack to new power
# outlets is slightly overkill. This controls how long (in seconds) we need to
# lose all power before load shedding is triggered. The default is 120 (2min).
scancore::power::load_shed_delay = 300
# When both nodes enter a thermal warning state, load shedding (when enabled)
# is delayed for a period of time. The logic here is the same as with power
# load shedding delay, except the likelihood of a transient thermal event
# impacting both nodes is less likely, so the timeout is set to 120 (2min).
scancore::temperature::load_shed_delay = 120
# When a node becomes "healthier" than its peer, it will migrate any servers
# on the peer to itself. A node's "health" is measured by various agents
# setting health scores for a node based on certain criteria, the details of
# which are left up to the given agents.
#
# ScanCore, at the end of the scan, sums the health scores (if any) and the
# node with the lowest score is declared "healthier". If a node stays healthier
# than the other for a period of time, "preventative migration" is performed.
# The minimum amount of time that one node needs to be healthier than the other
# before migration occurs is controlled here.
#
# NOTE: This delay is important because some health scores are set via external
# devices, like UPSes. Without this delay, server migration may occur as
# soon as one node scans the external devices first, when in fact, both
# nodes are still equally healthy (or equally sick, as the case may be).
# Thus, be sure to keep this value set to a value greater than the
# 'scancore::sleep_time' time.
scancore::health::migration_delay = 120
# Setting this to an integer will enable a post-scan check of nodes (from the
# dashboard) to see if a node's cluster lock manager is hung (as could happen
# in some rare dual-failure scenarios). If set, the value will be used as a
# timeout (in seconds) for the node to respond to a check of its cluster status.
# If the query doesn't return in the set number of seconds, the node will be
# considered hung and the dashboard will fence it. Setting this to '0'
# (default) will disable this feature.
#
# NOTE: This test will NOT be used if the node can't be reached, or if the HA
# stack is simply stopped. The only case where the node will be fenced
# is if the dashboard can log into the node *and* the 'ls /shared'
# command fails to return at all in the requested timeout.
#
# WARNING: This feature can not differentiate between spinlocks and normal high
# load. If you expect that a node might be placed under very high
# load, set this value high enough to account for slow responses. This
# feature is powerful in remote deployments in particular, but care
# should be taken when choosing to enable it.
#
scancore::dashboard::dlm_hung_timeout = 300
# When ScanCore or an agent wants to lock the database to do a resync or
# archive records, it will request a lock on the database. When other
# components connect to the database, they will check for a lock and if any
# requests are found, enter a wait loop. In the event of a stale lock, ScanCore
# will reap a lock if it is older than a certain age (5 minutes, by default).
# Under normal circumstances, the lock requester should automatically
# re-request their lock periodically, so that it is always less than this age.
# If you find that a lock was reaped when it shouldn't have been, you can
# increase this lock reap age here.
#scancore::locking::reap_age = 300
### Logging and Language
# ScanCore itself, plus each agent, can have customized log levels, languages
# and log files. You can also customize the language used when ScanCore or an agent
# prints to STDOUT (the screen). By default, all log to '/var/log/ScanCore.log'
# in 'en_CA' (Canadian English) with log level '1', which covers important or
# warning messages only. Likewise, output to STDOUT is also in 'en_CA'.
#
# To customize, you can use the following four variables:
#
#scancore::log_file = /var/log/ScanCore.log
#scancore::log_level = 1
#scancore::log_language = en_CA
#scancore::language = en_CA
# If you are hitting database problems, you can use this option to log all
# database writes. Note that you will need to set the similar variable for each
# agent you want to monitor (or just enable the DB logging for the agent giving
# you grief).
#scancore::log_db_transactions = 0
# The same five variables are used to configure the agents; simply substitute
# the agent's name for 'scancore'. Additionally, a sixth option is
# available for disabling agents. For example, to customize the IPMI
# scan agent 'scan-ipmitool', you can use:
#
#scan-ipmitool::log_file = /var/log/ScanCore.log
#scan-ipmitool::log_level = 2
#scan-ipmitool::log_language = en_CA
#scan-ipmitool::language = en_CA
#scan-ipmitool::log_db_transactions = 1
#scan-ipmitool::disable = 1
#
# NOTE: The language you choose MUST exist in the XML "strings" files!
#
# ScanCore itself:
#scancore::log_file = /var/log/ScanCore.log
#scancore::log_level = 2
#scancore::log_language = en_CA
#scancore::language = en_CA
#scancore::log_db_transactions = 1
# APC/Schneider brand SNMP switched PDU scan agent:
#scan-apc-pdu::log_file = /var/log/ScanCore.log
#scan-apc-pdu::log_level = 2
#scan-apc-pdu::log_language = en_CA
#scan-apc-pdu::language = en_CA
#scan-apc-pdu::log_db_transactions = 1
#scan-apc-pdu::disable = 1
# APC/Schneider brand SNMP UPS scan agent:
#scan-apc-ups::log_file = /var/log/ScanCore.log
#scan-apc-ups::log_level = 2
#scan-apc-ups::log_language = en_CA
#scan-apc-ups::language = en_CA
#scan-apc-ups::log_db_transactions = 1
#scan-apc-ups::disable = 1
# Linux bond driver scan agent:
#scan-bond::log_file = /var/log/ScanCore.log
#scan-bond::log_level = 2
#scan-bond::log_language = en_CA
#scan-bond::language = en_CA
#scan-bond::log_db_transactions = 1
#scan-bond::disable = 1
# RHEL's Resource Group manager's 'clustat' scan agent:
#scan-clustat::log_file = /var/log/ScanCore.log
#scan-clustat::log_level = 2
#scan-clustat::log_language = en_CA
#scan-clustat::language = en_CA
#scan-clustat::log_db_transactions = 1
#scan-clustat::disable = 1
### See HPACUCLI notes below!
# HP RAID controller scan agent:
#scan-hpacucli::log_file = /var/log/ScanCore.log
#scan-hpacucli::log_level = 2
#scan-hpacucli::log_language = en_CA
#scan-hpacucli::language = en_CA
#scan-hpacucli::log_db_transactions = 1
#scan-hpacucli::disable = 1
# IPMI scan agent:
#scan-ipmitool::log_file = /var/log/ScanCore.log
#scan-ipmitool::log_level = 2
#scan-ipmitool::log_language = en_CA
#scan-ipmitool::language = en_CA
#scan-ipmitool::log_db_transactions = 1
#scan-ipmitool::disable = 1
# KVM/QEMU virtual machine scan agent:
#scan-server::log_file = /var/log/ScanCore.log
#scan-server::log_level = 2
#scan-server::log_language = en_CA
#scan-server::language = en_CA
#scan-server::log_db_transactions = 1
#scan-server::disable = 1
# LSI/Avago RAID controller scan agent:
#scan-storcli::log_file = /var/log/ScanCore.log
#scan-storcli::log_level = 2
#scan-storcli::log_language = en_CA
#scan-storcli::language = en_CA
#scan-storcli::log_db_transactions = 1
#scan-storcli::disable = 1
# HPACUCLI Notes;
#
# 1. The 'HP Array Configuration Utility CLI' (hpacucli) can not be shipped
# with or automatically installed by the Anvil! system. You must download
# the hpacucli RPM manually and install it on each HP Proliant-based node.
# At the time of writing, this is the most recent version;
# - https://support.hpe.com/hpsc/swd/public/detail?swItemId=MTX_d6ebba0f5cd642edace4648b9a
#
# 2. The HP controllers will report each drive's maximum temperature. This is
# usually 40°C, close to the nominal operating temperature in many
# environments. If you find that the drives are repeatedly in a warning or
# critical temperature state, you can set the below variable to '1' and the
# reported maximum temperature will be ignored.
#scan-hpacucli::ignore_maximum_temperature = 0
#
# This will set the thresholds to the following (in °C, uncomment to
# change):
#scan-hpacucli::thresholds::drives::high_warning = 50
#scan-hpacucli::thresholds::drives::high_critical = 55
#scan-hpacucli::thresholds::drives::low_warning = 5
#scan-hpacucli::thresholds::drives::low_critical = 0
#scan-hpacucli::thresholds::drives::jump = 3
#scan-hpacucli::thresholds::drives::buffer = 2
###############################
# Power Shutdown Variables #
###############################
# ScanCore will power down an Anvil! when the UPSes powering it lose input
# power and drain below a minimum estimated hold up time. When the power is
# restored, the nodes will be booted once one of the UPSes has a minimum
# charge percentage.
#
# By default, the minimum hold-up time when running on batteries is set to
# 10 minutes (600 seconds). The default minimum charge percentage to boot back
# up is 45%.
#
# You may want to adjust this to better suit your needs, however.
#
# Your minimum hold-up time should be the time it takes to perform a
# "Cold Shut Down" from Striker, with all of your servers running, plus a
# little extra time as a buffer. This will ensure that your Anvil! will safely
# power down before the UPSes completely deplete.
#
# The minimum charge percentage should be set to a high enough number to handle
# losing power during the boot process. The right percentage depends a lot
# on your environment. To calculate it, look at the load on your UPSes under
# normal operation (as a number of watts). Then consult your UPS's "Runtime" or
# "Hold-Up Time" chart. This will tell you how many minutes your UPS will run
# given your load and a full charge. Divide the time you need to fully boot and
# then shut down your Anvil! by the runtime at full charge for your load.
# Multiply the result by 100 to get your minimum charge percentage.
#
# For example;
#
# Assume you need 15 minutes at worst from the time you start to boot your
# Anvil! until you can shut it back down. Also assume you have an average 400w
# load on an APC SmartUPS 1500 UPS.
#
# Looking at the "Batteries & Runtime" chart here:
# http://www.apc.com/resource/include/techspec_index.cfm?base_sku=SMT1500RM2U
# We see that a full charge can hold up a 400w load for 37 minutes.
#
# Knowing this, we have: ((15 ÷ 37) x 100) == 40.5%. We'll pad this to 45%.
#
# To configure this manually, uncomment the lines below. The
# 'scancore::minimum_ups_runtime' is the number of seconds and
# 'scancore::minimum_safe_charge' is the percentage (without the % sign).
#
#scancore::minimum_ups_runtime = 600
#scancore::minimum_safe_charge = 45
###############################
# Thermal Shutdown Variables #
###############################
# Each sensor has a default "weight" of '1'. When ScanCore finishes a cycle,
# it will look at the number of sensors that have gone above their high
# critical threshold, or below their low critical threshold, and add up their
# weights. If the total value is equal to or greater than this shutdown limit,
# ScanCore will initiate a withdrawal and power down of the node.
#scancore::temperature::shutdown_limit = 5
# When a node goes into thermal shutdown, many of its temperature (and other)
# sensors go offline. So a dashboard has a limited ability to determine if it
# is safe to boot a node back up.
#
# To account for this, when a node is in a thermal emergency stop, the
# dashboard will check its own temperature (if it has temp sensors) and, if it
# is OK, it will read the node's 'Ambient' and 'Systemboard' sensors (or the
# sensors defined by you in 'scan-ipmitool::offline_sensor_list', or similar
# from other scan agents).
#
# If those sensors on the target node are OK, then the last step is to count
# how many times in the last six hours the node went into thermal shutdown. The
# more times it has gone into thermal shutdown, the longer it waits before
# booting the node. By default, the delays are:
#
# Reboots | Wait X seconds until boot
# --------+--------------------------
# 1 | 600 (10 minutes)
# 2 | 1800 (30 minutes)
# 3 | 3600 (1 hour)
# 4 | 7200 (2 hours)
# >4 | 21600 (6 hours)
# --------+--------------------------
#
# The goal here is to minimize the risk of damage to the hardware. ScanCore
# tries to shut down a node before it goes into hardware-thermal shutdown and
# before damage can occur, but it is not perfect. It is possible that thermal
# shutdown is disabled in the BIOS and that the graceful shutdown of the node
# could take too long and damage could occur. This increasing delay is
# designed to help minimize the risk of a chronic failure in the node causing
# the node to repeatedly be put at risk.
#
# The trade off, of course, is increased downtime. Particularly if both nodes
# went into shutdown. So it is possible for you to override the default timing
# using the variables below.
#scancore::thermal_reboot_delay::1 = 600
#scancore::thermal_reboot_delay::2 = 1800
#scancore::thermal_reboot_delay::3 = 3600
#scancore::thermal_reboot_delay::4 = 7200
#scancore::thermal_reboot_delay::more = 21600
##############################################################################
# ScanCore - Scan Agent Overrides #
##############################################################################
### scan-ipmitool
# Where possible, the upper and lower temperature sensor values are pulled from
# the IPMI sensor data itself. Some sensor values do not specify their upper
# and lower sensor values, however. In this case, scan-ipmitool uses the
# following default limits (all in °C!):
# high_warning = 50
# high_critical = 55
# low_warning = 5
# low_critical = 0
#
# To catch sudden increases in temperature, usually indicative of an AC
# failure, we set a "jump" or 'delta' value. If the temperature of a sensor
# increases by more than this number of degrees celsius, a "sudden change"
# alert will be generated.
# The default 'jump' value is 5°C between two scans, typically 30 seconds or
# so.
#
# To avoid repeated alerts when a thermal sensor is hovering around a
# threshold, a buffer is used. For an alert to clear, a sensor must drop or
# rise 2°C below or above the high or low threshold, respectively. For example,
# if a sensor goes into warning at 50°C, it must drop below 48°C for the alert
# to be cleared.
#
# If you want to use a different set of defaults, you can do so by uncommenting
# and editing the following variables:
#scan-ipmitool::thresholds::default::high_warning = 50
#scan-ipmitool::thresholds::default::high_critical = 55
#scan-ipmitool::thresholds::default::low_warning = 5
#scan-ipmitool::thresholds::default::low_critical = 0
#scan-ipmitool::thresholds::default::jump = 5
#scan-ipmitool::thresholds::default::buffer = 2
# To override the thresholds of a single sensor, you can do so using the sensor
# name reported by 'ipmitool'. To get a list of the sensors and their names,
# you can use the command (from a linux machine):
#
# ipmitool -H an-a05n01.ipmi -U admin sensor list all
#
# Replace 'an-a05n01.ipmi' with the hostname or IP of your IPMI device.
# Replace 'admin' with the IPMI user name for your IPMI device.
# Enter the IPMI user's password when prompted.
#
# NOTE: Be sure to match the sensor name exactly!
#
# As an example, if you wanted to manually adjust the 'Ambient' sensor's
# thresholds, you can use the following;
#scan-ipmitool::thresholds::Ambient::high_warning = 50
#scan-ipmitool::thresholds::Ambient::high_critical = 55
#scan-ipmitool::thresholds::Ambient::low_warning = 5
#scan-ipmitool::thresholds::Ambient::low_critical = 0
#scan-ipmitool::thresholds::Ambient::jump = 5
#scan-ipmitool::thresholds::Ambient::buffer = 2
#
# You can adjust the weight of a given sensor (up or down) using 'weight',
# expressed as a whole or real number.
#scan-ipmitool::thresholds::Ambient::weight = 1.5
# When a node is shut down because the temperature got too high or low, most
# of the thermal sensors stop reporting data. In most cases, though, the
# 'Ambient' and 'Systemboard' remain readable. By default, the dashboard will
# check its own temperature and, if that is OK, check these sensors on the
# target node to determine if the node is safe to boot back up. If your nodes
# use different names, or if you have access to additional sensors, you can
# change the list of sensors checked by specifying them here as a
# comma-separated list.
#
# NOTE! The sensor names are case-sensitive and must match exactly to the
# output shown by ipmitool!
#
# To determine which sensor values are available when your nodes are off, you
# can run the following command from the Striker dashboard:
#
# ipmitool -H <node_ipmi_ip> -U <ipmi_user> sensor list all | grep 'degrees C'
#
#====
#Ambient | 25.000 | degrees C | ok | na | 1.000 | 6.000 | 37.000 | 42.000 | na
#Systemboard | 35.000 | degrees C | ok | na | na | na | 65.000 | 70.000 | na
#====
# If you have different hardware nodes, and the sensors differ between them,
# list all of the sensors here. Ones that are not found will be ignored.
#scan-ipmitool::offline_sensor_list = Ambient,Systemboard
# Most Striker Dashboards run commodity hardware and, thus, do not have IPMI.
# When a dashboard does have IPMI, it can make a much more informed decision
# about whether it is safe to boot a node that has gone into thermal shutdown
# because it will be able to check all of its own sensors. If any are in a
# 'warning' state, it will not boot the nodes. This is excellent when dealing
# with a data center or room that has lost cooling.
#
# If your dashboards do have IPMI, you can tell scan-ipmitool about how to
# access it using:
#
#scan-ipmitool::machine::<hostname>::power_check_command = <fence_ipmilan call>
#
# The 'fence_ipmilan' command is the command used by Striker dashboards to
# check the state of and to control the nodes. Generally, the command looks
# like this:
#
#fence_ipmilan -a <ipmi_ip> -l <ipmi_user> -p <password> -o status
#
# Try this call out on the dashboard to see if you can get the power state of
# the machine. It should return:
#
#Status: ON
#
# If it doesn't, please read 'man fence_ipmilan' to see what other switches you
# might need to be able to read it.
#
# Once you can read the status, remove the '-o status' and use the rest of the
# command as the value for this variable. Here is an example for the Striker
# dashboard called 'an-striker01.alteeve.com':
#
#scan-ipmitool::machine::an-striker01.alteeve.com::power_check_command = fence_ipmilan -a an-striker01.ipmi -l admin -p Initial1
###############################################################################
# Striker USB management #
###############################################################################
### Overview
#
# This controls how USB mass storage devices will be managed when plugged into
# given physical USB ports. It provides a mechanism for mounting a USB drive on
# a server running linux as though the USB drive had been plugged into the
# server directly, from the user's perspective.
#
# It works by mounting the USB storage device locally, then connecting to the
# target server and using NFS to mount it on the target. In this way, the
# drive will appear on the user's desktop (or at a defined mount point)
# automatically, and clean itself up automatically, when a USB drive is
# inserted into or removed from the host.
#
# It also supports LUKS-encrypted USB drives, including the ability to set up
# encryption on the USB drive for you if it is not yet encrypted. This does
# require storing the USB drive's passphrase on this machine, so caution is
# required when used this way.
### Variables
#
# This is set to '1' when you want to enable this feature. When using
# auto-encryption, be careful to only enable this when there is low risk of a
# user accidentally inserting a USB drive they don't want reformatted!
remote-usb::enable_remote_usb_mount = 0
# When it is time to mount the local USB drive's mount point on a remote
# machine, we need to pass credentials to the remote machine in order for it
# to connect to this machine.
#
# The host name entered below must be resolvable on the target server. If in
# doubt, use this machine's IP address.
#
# NOTE: If you use a host name, make sure the remote machine can resolve it to
# an IP address!
#
# The local 'mount' will have the USB device address appended to it. If the
# mounted filesystem has a label, that will be appended after the USB address
# as well.
#
# The 'host' can be the special replacement variable '#!short_hostname!#' which
# Striker will translate into the current dashboard's host name. This way, the
# same/sync'ed striker.conf will work on both nodes.
#
# NOTE: when using multiple options with 'export_options', do NOT use multiple
# '-o X -o Y' calls! Use '-o x,y' or else the NFS mount will be
# read-only.
#
#remote-usb::local::host = #!short_hostname!#
#remote-usb::local::user = root
#remote-usb::local::password = secret
#remote-usb::local::mount = /mnt/remote
#remote-usb::local::export_options = -i -o rw,sync,no_root_squash
# This configures which remote host to mount the USB drive on. Note that
# because of how sshfs works, only the user set below will be able to access
# the mount point.
#
# NOTE: Be sure that the local machine can resolve the 'host' name to an IP.
# As above, it is usually safest to use an IP address directly.
#
# The remote 'mount' will have the same appended suffix as the local mount.
#
#remote-usb::remote::host = 10.255.6.1
#remote-usb::remote::user = root
#remote-usb::remote::password = secret
#remote-usb::remote::mount = /mnt/remote
#remote-usb::remote::mount_options = -t nfs -o sync
# It is possible to decrypt a LUKS-encrypted partition, provided the LUKS key is
# provided. At this time, only one global LUKS key is supported. Enter it in
# the following variable.
#remote-usb::luks::passphrase = supersecret
# If the partition is not 'ext4' or if it is not encrypted, setting this to '1'
# will cause the drive to be reformatted and encrypted without prompt. This
# should only be enabled in very specific circumstances. If used, be sure to
# warn users that their drives will be reformatted if they plug them into a
# managed port and they are not encrypted!
#remote-usb::luks::force_initialize = 1