forked from GoogleCloudDataproc/bdutil
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bdutil
executable file
·1838 lines (1627 loc) · 58.5 KB
/
bdutil
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash
#
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Top-level harness which deploys a ready-to-use Hadoop cluster including
# starting GCE VMs, installing Hadoop binaries, configuring HDFS, installing
# GHFS libraries, and configuring GHFS.
#
# Usage: ./bdutil [ optional flags ] <command> [ args ]
# Run './bdutil --help' for the full list of flags and commands.
BDUTIL_VERSION='1.3.5-SNAPSHOT'
# Prints the bdutil version followed by the full usage/help text to stdout.
# This function only prints; it does not exit the script itself.
# The help text lives in a quoted heredoc ('EOF'), so nothing inside it is
# expanded by the shell. list_commands and list_options parse this output, so
# layout changes to the "Commands:" section or the flag lines affect them.
function print_usage() {
echo "bdutil version: ${BDUTIL_VERSION}"
cat <<'EOF'
Usage: ./bdutil [ optional flags ] <command> [ args ]
Description:
Utility for creating a Google Compute Engine cluster and installing, configuring, and calling
Hadoop and Hadoop-compatible software on it.
Flags:
-b, --bucket
Google Cloud Storage bucket used in deployment and by the cluster.
-D, --debug
If provided, enables high-verbosity debug logging switches for underlying
gcloud compute and gsutil calls both locally and on deployed VMs; may result
in significantly larger logfiles. Use with --verbose to also see this debug
info on the console.
-d, --use_attached_pds
If true, uses additional non-boot volumes, optionally creating them on
deploy if they don't exist already and deleting them on cluster delete.
-e, --env_var_files
Comma-separated list of bash files that are sourced to configure the cluster
and installed software. Files are sourced in order with later files being
sourced last. bdutil_env.sh is always sourced first. Flag arguments are
set after all sourced files, but before the evaluate_late_variable_bindings
method of bdutil_env.sh. see bdutil_env.sh for more information.
-F, --default_fs
Specifies the default filesystem to set, one of [gs|hdfs].
-f, --force
Assume default response (y) at prompt.
-h, --help
Print this help message.
-i, --image
Specify the Google Compute Engine image to use.
-m, --machine_type
Specify the Google Compute Engine machine type to use.
--master_boot_disk_size_gb
The size of the master boot disk.
-M, --master_machine_type
Specify the Google Compute Engine machine type for the master node.
--master_attached_pd_size_gb
Only applicable during deployment if USE_ATTACHED_PDS is true and
CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
the non-boot PD to create for the master node.
--master_attached_pd_type
Only applicable during deployment if USE_ATTACHED_PDS is true and
CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the disk type,
either 'pd-standard' or 'pd-ssd', to create for the master node.
--master_local_ssd_count
Number of local SSD devices to attach to the master node, in range [0, 4].
--network
Specify a network name with which to associate new virtual machines.
-n, --num_workers
The number of worker nodes to create.
--old_hostname_suffixes
If true, uses the old hostname convention of $PREFIX-nn and $PREFIX-dn-$i
instead of the new $PREFIX-master and $PREFIX-worker-$i. Should only be
used if necessary for interacting with older existing clusters,
as the old naming scheme is deprecated and will eventually be removed.
--preemptible
The fraction (between 0.0 and 1.0) of worker nodes to run as preemptible.
-P, --prefix
Common prefix for cluster nodes.
-p, --project
The Google Cloud Platform project to use to create the cluster.
-t, --target
Where to execute code for run_command and run_command_group.
Must be one of [master|workers|all].
-u, --upload_files
Comma-separated list of additional files to upload to VMs
-v --verbose
If provided, sends gcloud compute output to console in addition to logfiles.
--worker_attached_pds_size_gb
Only applicable during deployment if USE_ATTACHED_PDS is true and
CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
each non-boot PD to create for the worker nodes.
--worker_attached_pds_type
Only applicable during deployment if USE_ATTACHED_PDS is true and
CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the disk type,
either 'pd-standard' or 'pd-ssd', to create for the worker nodes.
--worker_local_ssd_count
Number of local SSD devices to attach to each worker node, in range [0, 4].
--worker_boot_disk_size_gb
The size of the worker boot disks.
-z, --zone
Specify the Google Compute Engine zone to use.
Commands:
create, delete, deploy, generate_config, run_command, run_command_group,
run_command_steps, shell, socksproxy
create Creates the VMs and optionally disks for the cluster.
delete Deletes the VMs and optionally disks for the cluster.
deploy Creates the VMs and optionally disks for the cluster and
then runs all COMMAND_STEPS specified in resolved
env_var_files on them.
generate_config Generates an overrides file containing the
environment-variable settings generated from flags. Takes a
single positional argument specifying the name of the file
to generate.
list_commands List the available commands (for command completion).
list_env_files List the short names for all env files in our extensions
path (for command completion).
list_options Succinctly list all command line options
(for command completion).
run_command Executes given code on nodes of a cluster. Uses --target
flag, with the default "master". Positional arguments
following run_command will be executed.
Use -- to pass flags to your command (see example).
run_command_group Executes the given command group in COMMAND_GROUPS
variable in resolved env_var_files on nodes of a cluster.
Uses --target flag, with the default "all". Takes one
positional argument of the bash command to run.
run_command_steps Runs all COMMAND_STEPS specified in resolved
env_var_files on the cluster.
shell Creates a new SSH connection to the master node.
socksproxy Create a SOCKS proxy running through the master node.
Examples:
Deploy a 5-worker cluster with prefix 'my-cluster' and BigQuery installed:
./bdutil -b foo-bucket -n 5 -P my-cluster -e bigquery_env.sh deploy
Pipe hadoop-validate-setup.sh into a bash shell on the cluster's master node:
./bdutil -P my-cluster shell < ./hadoop-validate-setup.sh
Upload and execute hadoop-validate-setup.sh on the cluster's master node as user foo-user:
./bdutil -P my-cluster -u hadoop-validate-setup.sh run_command -- \
sudo -u foo-user ./hadoop-validate-setup.sh
Configure an existing 20-worker Hadoop cluster, installing connectors as desired:
./bdutil -P pre-existing-cluster -n 20 run_command_group install_connectors
Generate an env file from flags, then deploy/delete using that file.
./bdutil -P prod-cluster1 -n 20 -b prod-bucket1 generate_config prod1_env.sh
./bdutil -e prod1_env.sh deploy
./bdutil -e prod1_env.sh delete
EOF
}
# Emits the command names for shell completion.
# Takes the part of print_usage's output that runs from the "Commands:" line
# to the next empty line, drops the first and last lines of that range, and
# strips the commas.
function list_commands() {
  sed -n -e '/^Commands:/,/^$/p' < <(print_usage) \
    | tail -n +2 \
    | head -n -1 \
    | tr -d ','
}
# Emits the flag lines for shell completion: every line of print_usage's
# output whose first non-space character is '-', with commas removed.
function list_options() {
  grep -E '^ *-' < <(print_usage) | tr -d ','
}
# Thin indirection over "date" so unit tests can replace this function.
# All arguments are handed to date unchanged.
function bdutil_date() {
  local date_args=("$@")
  date "${date_args[@]}"
}
# Writes a log line to stdout: the output of bdutil_date, then ": ", then the
# given arguments (echo joins them with single spaces).
function loginfo() {
  local stamp
  stamp="$(bdutil_date)"
  echo "${stamp}: ${@}"
}
# Logs via loginfo only when ${VERBOSE_MODE} evaluates to non-zero.
# Arguments are forwarded with "$@" so each message keeps its internal
# whitespace and is never glob-expanded against the current directory.
function logdebug() {
  if (( ${VERBOSE_MODE} )); then
    loginfo "$@"
  fi
}
# Logs via loginfo with the output redirected to stderr.
# Arguments are forwarded with "$@" so each message keeps its internal
# whitespace and is never glob-expanded against the current directory.
function logerror() {
  loginfo "$@" >&2
}
# Points the user at the --help flag (through logerror) and then terminates
# the shell with exit status 1.
function print_help() {
  local hint="For help run './bdutil --help.'"
  logerror "${hint}"
  exit 1
}
# Helper to consolidate the various error logs into a single debug file for
# easy review after an error occurs.
#
# Globals read:
#   GCLOUD_COMPUTE_STDOUT_FILE, GCLOUD_COMPUTE_STDERR_FILE, VM_DEBUG_FILE:
#     source logs; each one is appended only if the file exists.
#   AGGREGATE_DEBUG_FILE: destination file, always appended to.
#   VERBOSE_MODE: when non-zero the aggregate file is also printed to stdout.
#
# Every path is quoted so that log locations containing whitespace work
# instead of failing with "ambiguous redirect".
function consolidate_error_logs() {
  if [[ -e "${GCLOUD_COMPUTE_STDOUT_FILE}" ]]; then
    echo '******************* gcloud compute stdout *******************' \
      >> "${AGGREGATE_DEBUG_FILE}"
    cat "${GCLOUD_COMPUTE_STDOUT_FILE}" >> "${AGGREGATE_DEBUG_FILE}"
    echo >> "${AGGREGATE_DEBUG_FILE}"
  fi
  if [[ -e "${GCLOUD_COMPUTE_STDERR_FILE}" ]]; then
    echo '******************* gcloud compute stderr *******************' \
      >> "${AGGREGATE_DEBUG_FILE}"
    cat "${GCLOUD_COMPUTE_STDERR_FILE}" >> "${AGGREGATE_DEBUG_FILE}"
    echo >> "${AGGREGATE_DEBUG_FILE}"
    # Repeat only the lines mentioning "error" (case-insensitive) in their own
    # section so they are easy to find.
    echo '************ ERROR logs from gcloud compute stderr ************' \
      >> "${AGGREGATE_DEBUG_FILE}"
    grep -i 'error' "${GCLOUD_COMPUTE_STDERR_FILE}" >> "${AGGREGATE_DEBUG_FILE}"
    echo >> "${AGGREGATE_DEBUG_FILE}"
  fi
  if [[ -e "${VM_DEBUG_FILE}" ]]; then
    echo '******************* Exit codes and VM logs *******************' \
      >> "${AGGREGATE_DEBUG_FILE}"
    cat "${VM_DEBUG_FILE}" >> "${AGGREGATE_DEBUG_FILE}"
    echo >> "${AGGREGATE_DEBUG_FILE}"
  fi
  if (( ${VERBOSE_MODE} )); then
    loginfo "Verbose mode--printing full contents of details debug info"
    cat "${AGGREGATE_DEBUG_FILE}"
  fi
  logerror "Detailed debug info available in file: ${AGGREGATE_DEBUG_FILE}"
  logerror 'Check console output for error messages and/or retry your command.'
}
# Handler for errors occurring during the deployment to print useful info before
# exiting. The following global variables control whether handle_error() should
# actually process and consolidate a trapped error, or otherwise simply flip
# CAUGHT_ERROR to '1' without trying to consolidate logs or exiting in case
# the caller wants to simply continue on error.
#
# Other functions in this file install it with "trap handle_error ERR".
# SUPPRESS_TRAPPED_ERRORS: when non-zero, the handler only records the error
#   and returns to the caller.
# CAUGHT_ERROR: set to 1 every time the handler runs.
SUPPRESS_TRAPPED_ERRORS=0
CAUGHT_ERROR=0
function handle_error() {
# Save the error code responsible for the trap.
# This must remain the first statement; any earlier command would replace $?.
local errcode=$?
# Command text and line number are captured for the failure message below.
local bash_command=${BASH_COMMAND}
local lineno=${BASH_LINENO[0]}
CAUGHT_ERROR=1
if (( ${SUPPRESS_TRAPPED_ERRORS} )); then
loginfo "Continuing despite trapped error with code '${errcode}'"
return
fi
# Wait for remaining async things to finish, otherwise our error message may
# get lost among other logspam.
wait
logerror "Command failed: ${bash_command} on line ${lineno}."
logerror "Exit code of failed command: ${errcode}"
consolidate_error_logs
# Terminate the script with the exit code of the command that failed.
exit ${errcode}
}
# Pauses for GCLOUD_COMPUTE_SLEEP_TIME_BETWEEN_ASYNC_CALLS_SECONDS so that
# consecutive API calls are not all issued within the same second.
function sleep_for_api_ops() {
  local pause_seconds="${GCLOUD_COMPUTE_SLEEP_TIME_BETWEEN_ASYNC_CALLS_SECONDS}"
  sleep "${pause_seconds}"
}
# Helper for waiting on all async jobs to finish, with info logging. $1 should
# be a short description of what's being waited on.
#
# Installs handle_error as the ERR trap, then waits on every PID reported by
# "jobs -p" one at a time. The loop variable is local so that no stray global
# is left behind in the caller's environment.
function await_async_jobs() {
  trap handle_error ERR
  # Sleep a tiny bit to allow the async process to report its kickoff first,
  # to try to keep this "Might take a while" warning as the last message
  # before the long wait.
  sleep '0.5'
  loginfo "Waiting on async '$1' jobs to finish. Might take a while..."
  local subproc
  for subproc in $(jobs -p); do
    wait "${subproc}"
  done
  # Newline since the async jobs may have printed dots for progress.
  echo
}
# Given $1 describing the command to confirm (deploy|delete), prints and reads
# a confirmation prompt from the console.
#
# The prompt lists the resolved cluster settings. Variables expanded with
# ${VAR?} make the shell abort with an error if they are unset.
# When SKIP_PROMPT is non-zero the summary is echoed followed by "y" and no
# answer is read. Any answer other than exactly 'y' aborts with exit status 1.
# When running as root (EUID 0) a second confirmation is always requested,
# regardless of SKIP_PROMPT.
# Answers are read with "read -r" so that backslashes in the input are taken
# literally instead of acting as escape/line-continuation characters.
SKIP_PROMPT=0
function prompt_confirmation() {
  trap handle_error ERR
  local msg="$1 cluster with following settings?
CONFIGBUCKET='${CONFIGBUCKET?}'
PROJECT='${PROJECT?}'
GCE_IMAGE='${GCE_IMAGE?}'
GCE_ZONE='${GCE_ZONE?}'
GCE_NETWORK='${GCE_NETWORK?}'
GCE_TAGS='${GCE_TAGS?}'
PREEMPTIBLE_FRACTION=${PREEMPTIBLE_FRACTION?}
PREFIX='${PREFIX?}'
NUM_WORKERS=${NUM_WORKERS?}
MASTER_HOSTNAME='${MASTER_HOSTNAME}'
WORKERS='${WORKERS[@]}'
BDUTIL_GCS_STAGING_DIR='${BDUTIL_GCS_STAGING_DIR}'
"
  if (( ${USE_ATTACHED_PDS} )); then
    msg+="\
MASTER_ATTACHED_PD='${MASTER_ATTACHED_PD}'
WORKER_ATTACHED_PDS='${WORKER_ATTACHED_PDS[@]}'
"
  fi
  if [[ -n "${TARGET}" ]]; then
    msg+="TARGET='${TARGET}'
"
  fi
  if [[ -n "${COMMAND_GROUP}" ]]; then
    msg+="COMMAND_GROUP='${COMMAND_GROUP}'
"
  fi
  msg+="(y/n) "
  if (( ${SKIP_PROMPT} )); then
    echo "${msg}" y
  else
    read -r -p "${msg}" PROMPT_RESPONSE
    if [[ ${PROMPT_RESPONSE} != 'y' ]]; then
      logerror "Aborting command '${BDUTIL_CMD}', exiting..."
      exit 1
    fi
  fi
  if [[ ${EUID} -eq 0 ]]; then
    msg='Are you sure you want to run the command as root? (y/n)'
    read -r -p "${msg}" PROMPT_RESPONSE
    if [[ ${PROMPT_RESPONSE} != 'y' ]]; then
      logerror "Aborting command '$1', exiting..."
      exit 1
    fi
  fi
}
# Wrapper around gcloud compute ssh
#
# Arguments:
#   $1  - hostname of the instance to connect to.
#   $2  - command to run remotely, passed as --command=<value>.
#   $3+ - extra arguments appended to the ssh invocation as-is.
# The call is delegated to run_gcloud_compute_cmd; its exit status is returned.
function run_gcloud_compute_ssh() {
  local hostname="$1"
  # Empty command functions as regular ssh
  local remote_command="$2"
  local extra_args=("${@:3}")
  # Ping the server every minute and allow three keepalive messages to go
  # unanswered; give up on the initial connection after 30 seconds.
  extra_args+=('--ssh-flag=-oServerAliveInterval=60')
  extra_args+=('--ssh-flag=-oServerAliveCountMax=3')
  extra_args+=('--ssh-flag=-oConnectTimeout=30')
  # The hostname is quoted so it always reaches gcloud as a single argument.
  run_gcloud_compute_cmd ssh "${hostname}" \
    --command="${remote_command}" "${extra_args[@]}"
}
# The gcloud compute command with global flags and some common command flags to
# use for all GCE operations.
#
# Arguments: the "gcloud compute" sub-command and its arguments.
# Builds "gcloud <global flags> compute <args> --zone=${GCE_ZONE}" and runs it
# in one of three ways:
#   RAW_MODE     - run directly, output and stdin untouched.
#   VERBOSE_MODE - output goes to the console and is also appended (via tee) to
#                  GCLOUD_COMPUTE_STDOUT_FILE / GCLOUD_COMPUTE_STDERR_FILE.
#   otherwise    - print a "." and append output only to those two files.
# In the last two modes stdin is redirected from /dev/null.
# Returns the exit code of the gcloud invocation.
# NOTE(review): full_cmd is assigned without "local", so it is left set in the
# calling shell after this function returns.
function run_gcloud_compute_cmd() {
local gcloud_compute_args=("$@")
local gcloud_flags=()
# Add global flags
gcloud_flags+=("--project=${PROJECT}")
gcloud_flags+=('--quiet')
gcloud_compute_args+=("--zone=${GCE_ZONE}")
if (( ${DEBUG_MODE} )); then
gcloud_flags+=('--verbosity=debug')
else
gcloud_flags+=('--verbosity=info')
fi
full_cmd=(gcloud "${gcloud_flags[@]}" compute "${gcloud_compute_args[@]}")
if (( ${RAW_MODE} )); then
loginfo "Running ${full_cmd[@]}"
"${full_cmd[@]}"
elif (( ${VERBOSE_MODE} )); then
loginfo "Running ${full_cmd[@]}"
"${full_cmd[@]}" \
2> >(tee -a ${GCLOUD_COMPUTE_STDERR_FILE} 1>&2) \
1> >(tee -a ${GCLOUD_COMPUTE_STDOUT_FILE}) \
< /dev/null
else
echo -n "."
"${full_cmd[@]}" \
2>>${GCLOUD_COMPUTE_STDERR_FILE} \
1>>${GCLOUD_COMPUTE_STDOUT_FILE} \
< /dev/null
fi
# Must directly follow the if/elif/else above so that $? still holds the
# status of the gcloud command that was just run.
local exitcode=$?
if (( ${exitcode} != 0 )); then
if [[ "$*" =~ "--command=exit 0" ]]; then
# This is just an sshability check; only log it to debug.
logdebug "Exited ${exitcode} : ${full_cmd[@]}"
else
logerror "Exited ${exitcode} : ${full_cmd[@]}"
fi
# Every failure is also recorded in VM_DEBUG_FILE.
loginfo "Exited ${exitcode} : ${full_cmd[@]}" >> ${VM_DEBUG_FILE}
else
# A second progress dot marks successful completion.
echo -n '.'
fi
return ${exitcode}
}
# Checks for obvious issues like missing "required" fields.
#
# Each failed check logs a message and calls print_help, which exits with
# status 1. Besides validating, the function fills in defaults for settings
# that were left empty (TARGET, PROJECT, GCE_ZONE, WORKER_ATTACHED_PDS_TYPE,
# MASTER_ATTACHED_PD_TYPE, GCE_MASTER_MACHINE_TYPE, BDUTIL_GCS_STAGING_DIR);
# all of those except TARGET are also appended to ${OVERRIDES_FILE} as
# NAME=value lines.
function run_sanity_checks() {
  trap handle_error ERR
  if [[ -z "${CONFIGBUCKET}" ]]; then
    logerror 'CONFIGBUCKET must be provided'
    print_help
  fi
  # Make sure groupings of shell scripts for running on VMs are defined.
  if (( ${#COMMAND_GROUPS[@]} <= 0 )); then
    logerror 'COMMAND_GROUPS must be non-empty.'
    print_help
  fi
  # Make sure the series of steps to run on VMs are defined.
  if (( ${#COMMAND_STEPS[@]} <= 0 )); then
    logerror 'COMMAND_STEPS must be non-empty.'
    print_help
  fi
  # Make sure the preemptible fraction could be a fraction
  if [[ ! ${PREEMPTIBLE_FRACTION} =~ ^[0-1]?(\.[0-9]+)?$ ]]; then
    logerror "Preemptible fraction '${PREEMPTIBLE_FRACTION}' not a fraction."
    print_help
  fi
  # Make sure the preemptible fraction is in range. awk is used because bash
  # arithmetic cannot compare fractional numbers.
  local lt0=$(echo | awk -v x="${PREEMPTIBLE_FRACTION}" '{print (x < 0) ? 1 : 0}')
  local gt1=$(echo | awk -v x="${PREEMPTIBLE_FRACTION}" '{print (x > 1) ? 1 : 0}')
  if [[ $lt0 -eq 1 ]]; then
    logerror "Preemptible fraction '${PREEMPTIBLE_FRACTION}' is less than 0.0"
    print_help
  fi
  if [[ $gt1 -eq 1 ]]; then
    logerror "Preemptible fraction '${PREEMPTIBLE_FRACTION}' greater than 1.0"
    print_help
  fi
  # Make sure the hostnames all abide by the PREFIX.
  local node=''
  for node in "${WORKERS[@]}" "${MASTER_HOSTNAME?}"; do
    if ! [[ "${node}" =~ ^${PREFIX}.* ]]; then
      logerror "Error: VM instance name ${node} doesn't start with ${PREFIX}."
      print_help
    fi
  done
  # Check for the right number of elements in WORKERS.
  if (( ${#WORKERS[@]} != ${NUM_WORKERS?} )); then
    logerror "WORKERS must contain ${NUM_WORKERS} elements; got ${#WORKERS[@]}"
    print_help
  fi
  # Check for disk names being defined if USE_ATTACHED_PDS is true.
  if (( ${USE_ATTACHED_PDS} )); then
    if (( ${#WORKER_ATTACHED_PDS[@]} != ${NUM_WORKERS?} )); then
      local actual=${#WORKER_ATTACHED_PDS[@]}
      local varname='WORKER_ATTACHED_PDS'
      logerror "${varname} has ${actual} elements, expected ${NUM_WORKERS}"
      print_help
    fi
    if [[ -z "${MASTER_ATTACHED_PD}" ]]; then
      logerror 'MASTER_ATTACHED_PD must be defined since USE_ATTACHED_PDS==1.'
      print_help
    fi
  fi
  # Enforce maximum local-ssds per VM.
  if (( ${WORKER_LOCAL_SSD_COUNT} > 4 )); then
    logerror 'WORKER_LOCAL_SSD_COUNT can be a maximum of 4.'
    print_help
  fi
  if (( ${MASTER_LOCAL_SSD_COUNT} > 4 )); then
    logerror 'MASTER_LOCAL_SSD_COUNT can be a maximum of 4.'
    print_help
  fi
  # Make sure GCS connector is installed if it is the default FS
  if [[ "${DEFAULT_FS}" == 'gs' ]] && (( ! "${INSTALL_GCS_CONNECTOR}" )); then
    logerror 'INSTALL_GCS_CONNECTOR must 1 if DEFAULT_FS is gs.'
    print_help
  fi
  # Make sure HDFS is enabled if it is the default FS
  if [[ "${DEFAULT_FS}" == 'hdfs' ]] && (( ! "${ENABLE_HDFS}" )); then
    logerror 'ENABLE_HDFS must 1 if DEFAULT_FS is hdfs.'
    print_help
  fi
  # Make sure preemptible is only used with gs
  local gt0=$(echo | awk -v x="${PREEMPTIBLE_FRACTION}" '{print (x > 0) ? 1 : 0}')
  if [[ "${DEFAULT_FS}" != 'gs' ]]; then
    if [[ "$gt0" -eq 1 ]]; then
      logerror 'Preemptible VMs can only be used with GCS as the DEFAULT_FS.'
      print_help
    fi
  fi
  # Everything before the first "://" is treated as the URI scheme.
  local scheme=${HADOOP_TARBALL_URI%%://*}
  # Make sure HADOOP_TARBALL_URI uses supported scheme
  if [[ ! "${scheme}" =~ ^(gs|https?)$ ]] ; then
    logerror "Unsupported scheme: \"$scheme\" in" \
      "HADOOP_TARBALL_URI: ${HADOOP_TARBALL_URI}."
    print_help
  fi
  # Make sure TARGET is set correctly
  if [[ "${BDUTIL_CMD}" =~ ^run_command(_group)?$ ]]; then
    if [[ -z "${TARGET}" ]]; then
      if [[ "${BDUTIL_CMD}" == 'run_command_group' ]]; then
        TARGET='all'
      else
        TARGET='master'
      fi
      logerror "TARGET unspecified assuming ${TARGET}."
    elif ! [[ "${TARGET}" =~ ^(master|workers|all)$ ]]; then
      logerror '--target must be [master|workers|all].'
      print_help
    fi
  elif [[ -n "${TARGET}" ]]; then
    logerror "Flag --target can only be specified for run_command" \
      "or run_command_group."
    print_help
  fi
  if [[ -n "${COMMAND_GROUP}" ]]; then
    # A group matches when "<name>:" appears anywhere in the space-joined
    # COMMAND_GROUPS entries.
    if ! grep -q -- "${COMMAND_GROUP}:" <<< "${COMMAND_GROUPS[@]}"; then
      logerror "Command group '${COMMAND_GROUP}' not found in" \
        "resolved COMMAND_GROUPS."
      print_help
    fi
  fi
  if [[ -z "${PROJECT}" ]]; then
    loginfo 'No project provided; using default gcloud project.'
    PROJECT="$(gcloud config list | grep project | cut -d'=' -f2 | tr -d ' ')"
    if [[ -n "${PROJECT}" ]]; then
      loginfo "Using project '${PROJECT}'"
      echo "PROJECT=${PROJECT}" >> "${OVERRIDES_FILE}"
    else
      logerror 'Cannot find project using gcloud.'
      print_help
    fi
  fi
  if [[ -z "${GCE_ZONE}" ]]; then
    loginfo 'No zone provided; using default gcloud zone'
    GCE_ZONE="$(gcloud config list compute/zone | grep zone | cut -d'=' -f2 | tr -d ' ')"
    if [[ -n "${GCE_ZONE}" ]]; then
      loginfo "Using zone '${GCE_ZONE}'"
      echo "GCE_ZONE=${GCE_ZONE}" >> "${OVERRIDES_FILE}"
    else
      logerror 'Cannot find zone using gcloud.'
      print_help
    fi
  fi
  if [[ -z "${WORKER_ATTACHED_PDS_TYPE}" ]]; then
    loginfo 'No WORKER_ATTACHED_PDS_TYPE provided; defaulting to pd-standard.'
    WORKER_ATTACHED_PDS_TYPE='pd-standard'
    echo "WORKER_ATTACHED_PDS_TYPE=${WORKER_ATTACHED_PDS_TYPE}" \
      >> "${OVERRIDES_FILE}"
  fi
  if [[ -z "${MASTER_ATTACHED_PD_TYPE}" ]]; then
    loginfo 'No MASTER_ATTACHED_PD_TYPE provided; defaulting to pd-standard.'
    MASTER_ATTACHED_PD_TYPE='pd-standard'
    echo "MASTER_ATTACHED_PD_TYPE=${MASTER_ATTACHED_PD_TYPE}" \
      >> "${OVERRIDES_FILE}"
  fi
  if [[ -z "${GCE_MASTER_MACHINE_TYPE}" ]]; then
    loginfo 'No explicit GCE_MASTER_MACHINE_TYPE provided; defaulting to' \
      "value of GCE_MACHINE_TYPE: ${GCE_MACHINE_TYPE}"
    GCE_MASTER_MACHINE_TYPE="${GCE_MACHINE_TYPE}"
    echo "GCE_MASTER_MACHINE_TYPE=${GCE_MASTER_MACHINE_TYPE}" \
      >> "${OVERRIDES_FILE}"
  fi
  # A boot disk size of 0 skips the minimum-size check.
  if (( MASTER_BOOT_DISK_SIZE_GB != 0 && MASTER_BOOT_DISK_SIZE_GB < 10 )); then
    logerror 'MASTER_BOOT_DISK_SIZE_GB must be a minimum of 10.'
    print_help
  fi
  if (( WORKER_BOOT_DISK_SIZE_GB != 0 && WORKER_BOOT_DISK_SIZE_GB < 10 )); then
    logerror 'WORKER_BOOT_DISK_SIZE_GB must be a minimum of 10.'
    print_help
  fi
  # TODO(dhuo): Possibly all "late variable bindings" could be generated here
  # instead of actually requiring the evaluate_late_variable_bindings function.
  if [[ -z "${BDUTIL_GCS_STAGING_DIR}" ]]; then
    loginfo 'No staging directory got defined; computing one now.'
    local staging_dir_base="gs://${CONFIGBUCKET}/bdutil-staging"
    BDUTIL_GCS_STAGING_DIR="${staging_dir_base}/${MASTER_HOSTNAME}"
    echo "BDUTIL_GCS_STAGING_DIR=${BDUTIL_GCS_STAGING_DIR}" >> "${OVERRIDES_FILE}"
  fi
  # Make sure fully qualified hostnames will be 64 characters or less to avoid
  # JVM issues. Assumes FQDNs are <name>.c.${PROJECT}.internal
  if ! [[ "${PROJECT}" =~ [a-z] ]]; then
    logerror "Warning. Interpreting \$PROJECT as a project number instead of" \
      "a Project ID. Instance fully qualified domain name length validation" \
      "is disabled."
  else
    local char_limit=$(( 64 - 12 - ${#PROJECT} + 1 )) # 12 for .c..internal
    # Pick the first hostname that is at least char_limit characters long.
    # The unquoted echo intentionally puts one name per whitespace-separated
    # word for grep to scan.
    local too_long_vm_name=$(echo ${MASTER_HOSTNAME} ${WORKERS[@]} \
      | grep -Eo "\S{${char_limit},}" \
      | head -n 1)
    if [[ -n "${too_long_vm_name}" ]]; then
      local fqdn="${too_long_vm_name}.c.${PROJECT/:/.}.internal"
      logerror "VM '${too_long_vm_name}' will have the ${#fqdn} character" \
        "fully qualified domain name of '${fqdn}', while the JVM can only" \
        "handle up to 64 characters. Please rerun with a shorter \$PREFIX."
      print_help
    fi
  fi
}
# Checks for more heavyweight but obvious issues like CONFIGBUCKET
# inaccessibility prior to turning on any VMs.
#
# Returns 0 when every check passes. Returns the failing gsutil exit code when
# the bucket or the gs:// tarball cannot be accessed, and 1 when an entry of
# UPLOAD_FILES is not readable.
function validate_heavyweight_settings() {
  # Perform gsutil checks last, because they are slow.
  loginfo "Checking for existence of gs://${CONFIGBUCKET}..."
  gsutil ls -b "gs://${CONFIGBUCKET}"
  # Catch the exitcode so that we can provide more user-friendly error messages
  # while still propagating the return value out for consolidated error-trap
  # handling.
  local exitcode=$?
  if (( ${exitcode} != 0 )); then
    logerror "Failed to access bucket gs://${CONFIGBUCKET}."
    logerror 'Please make sure the bucket exists and is accessible with gsutil.'
    return ${exitcode}
  fi
  # Make sure HADOOP_TARBALL_URI exists if it uses the gs:// scheme; other
  # schemes are not checked here.
  local scheme="${HADOOP_TARBALL_URI%%://*}"
  if [[ "${scheme}" == 'gs' ]]; then
    loginfo "Checking for existence of ${HADOOP_TARBALL_URI}..."
    if (( ${VERBOSE_MODE} )); then
      gsutil stat "${HADOOP_TARBALL_URI}"
    else
      gsutil -q stat "${HADOOP_TARBALL_URI}"
    fi
    # $? here is the status of whichever gsutil stat call ran above.
    exitcode=$?
    if (( ${exitcode} != 0 )); then
      logerror "Failed to find file ${HADOOP_TARBALL_URI}."
      logerror 'Please make sure it exists and is accessible with gsutil.'
      return ${exitcode}
    fi
  fi
  # Check all the specified UPLOAD_FILES. The array is expanded quoted so that
  # paths containing whitespace are checked as a single file.
  if (( ${#UPLOAD_FILES[@]} > 0 )); then
    loginfo "Checking upload files..."
    local upload_file=''
    for upload_file in "${UPLOAD_FILES[@]}"; do
      if [[ -r "${upload_file}" ]]; then
        loginfo "Verified '${upload_file}'"
      else
        logerror "Failed to read file ${upload_file}."
        logerror 'Please make sure it exists and is accessible.'
        return 1
      fi
    done
  fi
  return 0
}
# Succeeds (status 0) only for a single-node setup, i.e. when WORKERS holds
# exactly one element and that element equals MASTER_HOSTNAME; otherwise
# returns 1. Callers use this to avoid repeating per-instance steps (such as
# instance create/delete) for the same machine.
function is_single_node_setup() {
  if [[ "${#WORKERS[@]}" != 1 ]]; then
    return 1
  fi
  [[ "${WORKERS[0]}" == "${MASTER_HOSTNAME}" ]]
}
# Repeatedly try to ssh into node until success or limit is reached.
# Will fail if node takes too long.
#
# Arguments:
#   $1 - name of the node to probe with "exit 0" over ssh.
# Makes up to 10 attempts, sleeping BDUTIL_POLL_INTERVAL_SECONDS after each
# failure. Returns 0 on the first successful attempt, otherwise the exit code
# of the last failed attempt.
# The loop counter is local so a foreground call cannot clobber a caller's "i".
function wait_for_ssh() {
  trap handle_error ERR
  local node="$1"
  local max_attempts=10
  local sleep_time="${BDUTIL_POLL_INTERVAL_SECONDS}"
  local errcode
  local i
  for (( i = 0; i < max_attempts; i++ )); do
    if run_gcloud_compute_ssh "${node}" 'exit 0'; then
      return 0
    else
      # Save the exit code of the failed ssh attempt.
      errcode=$?
      loginfo "'${node}' not yet sshable (${errcode}); sleeping ${sleep_time}."
      sleep "${sleep_time}"
    fi
  done
  logerror "Node '${node}' did not become ssh-able after ${max_attempts} attempts"
  return ${errcode}
}
# Creates the VMs and optionally PDs of the cluster
#
# Phases, each ending in await_async_jobs:
#   1. If USE_ATTACHED_PDS and CREATE_ATTACHED_PDS_ON_DEPLOY are both non-zero,
#      create the worker disks and the master disk.
#   2. Create the worker instances and the master instance.
#   3. Poll every instance with wait_for_ssh until it accepts ssh.
# gcloud calls are started in the background; after every
# MAX_CONCURRENT_ASYNC_PROCESSES launches the loop waits for the running batch.
# Worker steps in phases 1 and 2 are skipped for a single-node setup, and
# master steps are skipped when SKIP_MASTER is non-zero.
# NOTE(review): i, j, DISK and NODE are not declared local, so they remain set
# in the calling shell.
function create_cluster() {
trap handle_error ERR
# Optionally create the disks to be attached to the VMs.
if (( ${USE_ATTACHED_PDS} && ${CREATE_ATTACHED_PDS_ON_DEPLOY} )); then
if ! is_single_node_setup; then
loginfo "Creating attached worker disks: ${WORKER_ATTACHED_PDS[@]}"
for ((i=0; i < NUM_WORKERS; i++)); do
# Throttle: wait for the current batch before launching more.
if (( ${i} > 0 && ${i} % ${MAX_CONCURRENT_ASYNC_PROCESSES} == 0 )); then
await_async_jobs 'disks create (partial)'
loginfo 'Done. Invoking next batch...'
fi
DISK=${WORKER_ATTACHED_PDS[${i}]}
run_gcloud_compute_cmd \
disks create \
--size=${WORKER_ATTACHED_PDS_SIZE_GB} \
--type=${WORKER_ATTACHED_PDS_TYPE} \
${DISK} &
sleep_for_api_ops
done
fi
if ! (( ${SKIP_MASTER} )); then
loginfo "Creating attached master disk: ${MASTER_ATTACHED_PD}"
run_gcloud_compute_cmd \
disks create \
--size=${MASTER_ATTACHED_PD_SIZE_GB} \
--type=${MASTER_ATTACHED_PD_TYPE} \
${MASTER_ATTACHED_PD} &
# NOTE(review): this message is logged right after the background launch,
# before await_async_jobs below has confirmed completion.
loginfo 'Done creating disks!'
else
loginfo 'Skipping master-disk creation because SKIP_MASTER is true.'
fi
await_async_jobs 'disks create'
fi
# Start workers and master.
# For now, we will always auto-create a persistent boot disk and auto-delete
# it on shutdown; truly persistent volumes will be used as a non-root mount
# point. We can preserve the persistent boot disk once the setup is
# idempotent.
if ! is_single_node_setup; then
loginfo "Creating worker instances: ${WORKERS[@]}"
for ((i=0; i < NUM_WORKERS; i++)); do
if (( ${i} > 0 && ${i} % ${MAX_CONCURRENT_ASYNC_PROCESSES} == 0 )); then
await_async_jobs 'instances create (partial)'
loginfo 'Done. Invoking next batch...'
fi
# optional_disk_arg accumulates space-separated flags; it is expanded
# unquoted in the gcloud call below so it splits into separate arguments.
local optional_disk_arg=''
if (( ${USE_ATTACHED_PDS} )); then
optional_disk_arg+="--disk name=${WORKER_ATTACHED_PDS[${i}]},mode=rw "
fi
if (( WORKER_BOOT_DISK_SIZE_GB > 0 )); then
optional_disk_arg+="--boot-disk-size=${WORKER_BOOT_DISK_SIZE_GB} "
fi
if (( ${WORKER_LOCAL_SSD_COUNT} > 0 )); then
# One --local-ssd flag per requested device.
for ((j = 0; j < WORKER_LOCAL_SSD_COUNT; j++)); do
optional_disk_arg+='--local-ssd interface=SCSI '
done
fi
# The first NUM_PREEMPTIBLE workers are created as preemptible.
local optional_preemptible_arg=""
if (( ${i} < NUM_PREEMPTIBLE )); then
optional_preemptible_arg="--preemptible"
fi
# The --scopes value is GCE_SERVICE_ACCOUNT_SCOPES joined with commas; IFS is
# changed only inside the command substitution's subshell.
run_gcloud_compute_cmd \
instances create \
${WORKERS[${i}]} \
--machine-type=${GCE_MACHINE_TYPE} \
${optional_preemptible_arg} \
--image=${GCE_IMAGE} \
--network=${GCE_NETWORK} \
--tags=${GCE_TAGS} \
--scopes $(export IFS=,; echo "${GCE_SERVICE_ACCOUNT_SCOPES[*]}";) \
--boot-disk-type=pd-standard \
${optional_disk_arg} &
sleep_for_api_ops
done
fi
if ! (( ${SKIP_MASTER} )); then
loginfo "Creating master instance: ${MASTER_HOSTNAME}"
# Same flag-string construction as for the workers, using master settings.
local optional_disk_arg=''
if (( ${USE_ATTACHED_PDS} )); then
optional_disk_arg+="--disk name=${MASTER_ATTACHED_PD},mode=rw "
fi
if (( MASTER_BOOT_DISK_SIZE_GB > 0 )); then
optional_disk_arg+="--boot-disk-size=${MASTER_BOOT_DISK_SIZE_GB} "
fi
if (( ${MASTER_LOCAL_SSD_COUNT} > 0 )); then
for ((j = 0; j < MASTER_LOCAL_SSD_COUNT; j++)); do
optional_disk_arg+='--local-ssd interface=SCSI '
done
fi
run_gcloud_compute_cmd \
instances create \
${MASTER_HOSTNAME} \
--machine-type=${GCE_MASTER_MACHINE_TYPE} \
--image=${GCE_IMAGE} \
--network=${GCE_NETWORK} \
--tags=${GCE_TAGS} \
--scopes $(export IFS=,; echo "${GCE_SERVICE_ACCOUNT_SCOPES[*]}";) \
--boot-disk-type=pd-standard \
${optional_disk_arg} &
else
loginfo 'Skipping master creation because SKIP_MASTER is true.'
fi
await_async_jobs 'instances create'
loginfo 'Instances all created. Entering polling loop to wait for ssh-ability'
# This wait is necessary due to VMs not being immediately ssh-able. It may
# still fail if a VM is particularly slow in becoming ssh-able.
for ((i=0; i < NUM_WORKERS; i++)); do
if (( ${i} > 0 && ${i} % ${MAX_CONCURRENT_ASYNC_PROCESSES} == 0 )); then
await_async_jobs 'wait_for_ssh (partial)'
loginfo 'Done. Invoking next batch...'
fi
NODE=${WORKERS[${i}]}
# Each probe runs in the background, i.e. in its own subshell.
wait_for_ssh ${NODE} &
sleep_for_api_ops
done
if ! (( ${SKIP_MASTER} )); then
wait_for_ssh ${MASTER_HOSTNAME} &
else
loginfo 'Skipping wait_for_ssh because SKIP_MASTER is true.'
fi
# Wait for all nodes to be ready.
await_async_jobs 'wait_for_ssh'
loginfo 'Instances all ssh-able'
}
# Delete cluster and optionally attached PDs
#
# Deletes the worker and master instances (with their boot disks) in
# background batches, then, if USE_ATTACHED_PDS and
# DELETE_ATTACHED_PDS_ON_DELETE are both non-zero, deletes the attached disks.
# Trapped errors are suppressed while this runs so that one failed deletion
# does not stop the rest; if any error was recorded in CAUGHT_ERROR the logs
# are consolidated and the script exits with status 1.
# NOTE(review): on that exit path SUPPRESS_TRAPPED_ERRORS is not reset to 0;
# the reset on the last line only runs when no error was caught.
function delete_cluster() {
# For deletion, we want to continue despite errors, but print a warning at the end.
SUPPRESS_TRAPPED_ERRORS=1
trap handle_error ERR
loginfo 'Deleting hadoop cluster...'
if ! is_single_node_setup; then
for ((i=0; i < NUM_WORKERS; i++)); do
# Throttle: wait for the current batch before launching more.
if (( ${i} > 0 && ${i} % ${MAX_CONCURRENT_ASYNC_PROCESSES} == 0 )); then
await_async_jobs 'instances delete (partial)'
loginfo 'Done. Invoking next batch...'
fi
NODE=${WORKERS[${i}]}
run_gcloud_compute_cmd instances delete --delete-disks=boot ${NODE} &
sleep_for_api_ops
done
fi
if ! (( ${SKIP_MASTER} )); then
run_gcloud_compute_cmd instances delete \
--delete-disks=boot ${MASTER_HOSTNAME} &
else
loginfo 'Skipping master deletion because SKIP_MASTER is true.'
fi
await_async_jobs 'instances delete'
loginfo 'Done deleting VMs!'
# Optionally delete all the attached disks as well now that the instances
# have been deleted.
if (( ${USE_ATTACHED_PDS} && ${DELETE_ATTACHED_PDS_ON_DELETE} )); then
if ! is_single_node_setup; then
loginfo "Deleting attached worker disks: ${WORKER_ATTACHED_PDS[@]}"
for ((i=0; i < NUM_WORKERS; i++)); do
if (( ${i} > 0 && ${i} % ${MAX_CONCURRENT_ASYNC_PROCESSES} == 0 )); then
await_async_jobs 'disks delete (partial)'
loginfo 'Done. Invoking next batch...'
fi
DISK=${WORKER_ATTACHED_PDS[${i}]}
run_gcloud_compute_cmd disks delete ${DISK} &
sleep_for_api_ops
done
fi
if ! (( ${SKIP_MASTER} )); then
loginfo "Deleting attached master disk: ${MASTER_ATTACHED_PD}"
run_gcloud_compute_cmd disks delete ${MASTER_ATTACHED_PD} &
else
loginfo 'Skipping master-disk deletion because SKIP_MASTER is true.'
fi
await_async_jobs 'disks delete'
loginfo 'Done deleting disks!'
fi
# CAUGHT_ERROR is set by handle_error whenever the ERR trap fired.
if (( ${CAUGHT_ERROR} )); then
logerror "Warning: Some errors occurred, please review specified logfiles."
consolidate_error_logs
exit 1
fi
SUPPRESS_TRAPPED_ERRORS=0
}
# Given an env file name and a colon-separated path, look for the filename
# in the path if it is an unqualified filename. If the filename includes
# a directory part (either relative or absolute),
# or if the file is not found in the path, the filename alone is returned.
#
# Arguments:
#   $1 - env file name as given by the user.
#   $2 - colon-separated list of directories to search.
# Outputs: the resolved path (or the unchanged name) on stdout.
# Empty path elements (e.g. from "a::b" or a leading ':') are skipped so that
# the filesystem root is never searched by accident.
function resolve_env_file() {
  local FILENAME="$1"
  local EXTENSIONS_PATH="$2"
  local ext_path=()
  local file=''
  local dir=''
  if [[ "$(basename -- "${FILENAME}")" == "${FILENAME}" ]]; then
    # If the filename has no directory part, we look for it in the path.
    IFS=: read -r -a ext_path <<< "${EXTENSIONS_PATH}"
    # First look for the filename as specified, then try appending "_env.sh".
    for file in "${FILENAME}" "${FILENAME}_env.sh"; do
      for dir in "${ext_path[@]}"; do
        if [[ -n "${dir}" && -f "${dir}/${file}" ]]; then
          echo "${dir}/${file}"
          return
        fi
      done
    done
    # If the file is not in our path, fall through here and output $FILENAME
  fi
  echo "${FILENAME}" # Filename with directory, or not in our path.
}
# Prints every entry directly under ${BDUTIL_DIR}/extensions and
# ${BDUTIL_DIR}/platforms as a single colon-separated string.
# BDUTIL_DIR is quoted so an install path containing whitespace still globs
# correctly; the globs themselves stay unquoted so they expand.
function get_extension_subdirs() {
  local a=( "${BDUTIL_DIR}"/extensions/* "${BDUTIL_DIR}"/platforms/* )
  # IFS is changed only inside the subshell; "${a[*]}" joins on its first
  # character.
  ( IFS=: && echo "${a[*]}" )
}
# Prints the colon-separated search path for env files, in lookup order:
# the current directory (.), the user's BDUTIL_EXTENSIONS_PATH, the bdutil
# directory itself, and finally whatever get_extension_subdirs reports.
function get_extensions_path() {
  echo ".:${BDUTIL_EXTENSIONS_PATH}:${BDUTIL_DIR}:$(get_extension_subdirs)"
}
# Resolve all of the files in ENV_FILES by calling resolve_env_file on each
# one and placing the result back into ENV_FILES.
#
# The array is iterated quoted so an entry containing whitespace is resolved
# as one file instead of being split into several; results are written back
# at consecutive indexes starting from 0.
function resolve_env_files() {
  local EXTENSIONS_PATH
  EXTENSIONS_PATH="$(get_extensions_path)"
  local n=0
  local file=''
  for file in "${ENV_FILES[@]}"; do
    ENV_FILES[n]="$(resolve_env_file "${file}" "${EXTENSIONS_PATH}")"
    n=$(( n + 1 ))
  done
}