-
Notifications
You must be signed in to change notification settings - Fork 29
/
kvm_rolling_reboot.py
executable file
·436 lines (374 loc) · 17.1 KB
/
kvm_rolling_reboot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
#!/usr/bin/env python
# Copyright 2016, Schuberg Philis BV
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Patch KVM hypervisors that are registered in Cosmic
# Remi Bergsma, [email protected]
# We depend on these modules
import sys
import time
from datetime import datetime
import os
import getopt
import getpass
from cloudstackops import cloudstackops
from cloudstackops import kvm
# Fabric
from fabric.api import *
from fabric import *
from fabric.network import disconnect_all
# Handle arguments passed
def handleArguments(argv):
    """Parse the command line and publish the settings as module globals.

    Every setting is first reset to its default, then overridden by the
    options found in ``argv``.  The rest of the script reads the resulting
    module-level names (``DEBUG``, ``DRYRUN``, ``clustername``, ...).

    Args:
        argv: The argument list without the program name, e.g.
            ``sys.argv[1:]``.

    Exits:
        With status 2 when an option is unknown or ``--threads`` is not a
        positive integer, with status 1 when no cluster name was given, and
        with the default status after printing the usage for ``-h``.
    """
    global DEBUG, DRYRUN, PREPARE
    global clustername, configProfileName
    global ignoreHostList, ignoreHosts, onlyHosts, onlyHostList
    global skip_os_version, threads
    global halt_hypervisor, force_reset_hypervisor
    global skip_reboot_hypervisor, firmware_reboot_hypervisor
    global pre_empty_script, post_empty_script, post_reboot_script
    global checkBonds

    # Defaults; DRYRUN stays on unless --exec is passed explicitly.
    DEBUG = 0
    DRYRUN = 1
    PREPARE = 0
    clustername = ''
    configProfileName = ''
    ignoreHostList = ""
    ignoreHosts = ''
    onlyHosts = ''
    onlyHostList = ""
    skip_os_version = ""
    threads = 5
    halt_hypervisor = False
    force_reset_hypervisor = False
    skip_reboot_hypervisor = False
    firmware_reboot_hypervisor = False
    pre_empty_script = 'kvm_pre_empty_script.sh'
    post_empty_script = 'kvm_post_empty_script.sh'
    post_reboot_script = 'kvm_post_reboot_script.sh'
    checkBonds = True

    # Usage message (named `usage` so the builtin `help` is not shadowed)
    usage = "Usage: ./" + os.path.basename(__file__) + ' [options]' + (
        '\n --config-profile -c <profilename>\t\tSpecify the CloudMonkey profile name to '
        'get the credentials from (or specify in ./config file)'
        '\n --clustername -n <clustername> \t\tName of the cluster to work with'
        '\n --ignore-hosts <list>\t\t\t\tSkip work on the specified hosts (for example if you need to resume): '
        'Example: --ignore-hosts="host1, host2" '
        '\n --only-hosts <list>\t\t\t\tOnly execute work on the specified hosts (for example if you need to resume): '
        'Example: --only-hosts="host1, host2" '
        '\n --skip-os-version <version>\t\t\tSkip all hosts that match OS version (Example: CentOS 7.5.1804)'
        '\n --threads <nr>\t\t\t\tUse this number or concurrent migration threads '
        '\n --halt\t\t\t\t\tInstead of the default reboot, halt the hypervisor (useful in case of hardware '
        'upgrades) '
        '\n --force-reset-hypervisor\t\t\tInstead of the default reboot, force-reset the hypervisor (useful in '
        'situations where a normal reboot would hang. It will sync filesystems first.) '
        '\n --skip-reboot-hypervisor\t\t\tInstead of the default reboot, skip the hypervisor reboot (useful in '
        'situations where you would only want to live migrate virtual machines in the cluster.) '
        '\n --upgrade-firmware-reboot\t\t\tInstead of the default reboot, upgrade the HP firmware and reboot the hypervisor'
        '\n --pre-empty-script\t\t\t\tBash script to run on hypervisor before starting the live migrations to empty '
        'hypervisor (expected in same folder as this script)'
        '\n --post-empty-script\t\t\t\tBash script to run on hypervisor after a hypervisor has no more VMs running'
        '\n --post-reboot-script\t\t\t\tBash script to run on hypervisor after a hypervisor has been rebooted'
        '\n --no-bond-check\t\t\t\tSkip the bond check'
        '\n --debug\t\t\t\t\tEnable debug mode'
        '\n --exec\t\t\t\t\tExecute for real'
        '\n --prepare\t\t\t\t\tExecute some prepare commands'
    )

    try:
        # "config-profile=" is what the usage text documents; it used to be
        # missing here, so the documented flag was rejected by getopt.
        # "credentials-file=" is still accepted so old invocations keep parsing.
        opts, args = getopt.getopt(
            argv, "hc:n:t:p", [
                "config-profile=", "credentials-file=", "clustername=", "ignore-hosts=", "only-hosts=",
                "threads=", "pre-empty-script=", "post-empty-script=", "force-reset-hypervisor",
                "skip-reboot-hypervisor", "upgrade-firmware-reboot", "no-bond-check", "skip-os-version=",
                "halt", "debug", "exec", "post-reboot-script=", "prepare"])
    except getopt.GetoptError as e:
        print("Error: " + str(e))
        print(usage)
        sys.exit(2)

    # Options are compared with == / real tuples.  A parenthesised string such
    # as ("--pre-empty-script") is just a string, so `in` did substring
    # matching and e.g. "-p" matched "--pre-empty-script".
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-c", "--config-profile"):
            configProfileName = arg
        elif opt == "--credentials-file":
            # This option never had any effect; keep it that way but say so.
            print("Warning: --credentials-file is not used; use --config-profile instead.")
        elif opt in ("-n", "--clustername"):
            clustername = arg
        elif opt in ("-t", "--threads"):
            # Keep the same type as the integer default.
            try:
                threads = int(arg)
            except ValueError:
                threads = 0
            if threads < 1:
                print("Error: --threads expects a positive integer, got '" + arg + "'")
                print(usage)
                sys.exit(2)
        elif opt == "--ignore-hosts":
            ignoreHostList = arg
        elif opt == "--only-hosts":
            onlyHostList = arg
        elif opt == "--halt":
            halt_hypervisor = True
        elif opt == "--force-reset-hypervisor":
            force_reset_hypervisor = True
        elif opt == "--skip-reboot-hypervisor":
            skip_reboot_hypervisor = True
        elif opt == "--upgrade-firmware-reboot":
            firmware_reboot_hypervisor = True
        elif opt == "--pre-empty-script":
            pre_empty_script = arg
        elif opt == "--post-empty-script":
            post_empty_script = arg
        elif opt == "--post-reboot-script":
            post_reboot_script = arg
        elif opt == "--skip-os-version":
            skip_os_version = arg
        elif opt == "--debug":
            DEBUG = 1
        elif opt == "--exec":
            DRYRUN = 0
        elif opt == "--prepare":
            PREPARE = 1
        elif opt == "--no-bond-check":
            checkBonds = False
        # NOTE(review): "-p" is accepted by getopt but has no documented
        # meaning, so it is deliberately ignored here - confirm the intent.

    # Default to cloudmonkey default config file
    if not configProfileName:
        configProfileName = "config"

    # Host lists arrive as "host1, host2": drop blanks, split on commas and
    # discard empty names left behind by stray commas.
    ignoreHosts = [name for name in ignoreHostList.replace(' ', '').split(",") if name]
    onlyHosts = [name for name in onlyHostList.replace(' ', '').split(",") if name]

    # We need at least a cluster name
    if not clustername:
        print(usage)
        sys.exit(1)
if __name__ == '__main__':
    # Script entry point: parse the options, print an overview of the cluster,
    # and (unless running in DRYRUN mode) process the cluster's hosts one by
    # one: run the pre-empty script, disable the host, empty it, reboot/halt/
    # reset it, enable it again and wait until it is Enabled and Up.
    #
    # NOTE(review): the nesting of this block was reconstructed from a copy of
    # the file that had lost its indentation. In particular, whether the
    # "Waiting for host ... Disabled" loop and the per-host printHypervisors()
    # call sit inside or outside their neighbouring if/for should be confirmed
    # against the original file.
    handleArguments(sys.argv[1:])

    # Init CloudStack class
    c = cloudstackops.CloudStackOps(DEBUG, DRYRUN)
    c.task = "KVM Rolling Reboot"
    c.slack_custom_title = "Hypervisor"
    c.slack_custom_value = ""
    c.instance_name = "N/A"
    c.vm_name = "N/A"
    c.cluster = clustername

    # Init KVM class (SSH as the user running this script)
    k = kvm.Kvm(ssh_user=getpass.getuser(), threads=threads, pre_empty_script=pre_empty_script,
                post_empty_script=post_empty_script, post_reboot_script=post_reboot_script)
    k.DRYRUN = DRYRUN
    k.PREPARE = PREPARE
    c.kvm = k

    # make the config profile name known to our class
    c.configProfileName = configProfileName

    # Init the CloudStack API
    c.initCloudStackAPI()

    # Poolmaster
    # NOTE(review): this value is not read anywhere in this block.
    poolmaster = "n/a"

    if DEBUG == 1:
        print "API address: " + c.apiurl
        print "ApiKey: " + c.apikey
        print "SecretKey: " + c.secretkey

    # Check cloudstack IDs
    if DEBUG == 1:
        print "Note: Checking IDs of provided input.."
    clusterID = c.checkCloudStackName(
        {'csname': clustername, 'csApiCall': 'listClusters'})
    # A return value of 1 is treated as "cluster not found"
    if clusterID == 1:
        print "Error: Could not find cluster '" + clustername + "'."
        disconnect_all()
        sys.exit(1)

    # Get cluster hosts, sorted by name so the processing order is stable
    cluster_hosts = sorted(c.getAllHostsFromCluster(clusterID), key=lambda h: h.name)

    # Print cluster info
    print "Note: Gathering some info about cluster '" + clustername + "':"
    c.printCluster(clusterID, "KVM")

    # Put the scripts we need
    for host in cluster_hosts:
        k.put_scripts(host)

    # Print hypervisor info
    print "Note: Gathering some info about hypervisors in cluster '" + clustername + "':"
    c.printHypervisors(clusterID, False, checkBonds, "KVM")

    # Only send the warnings below to Slack when executing for real
    to_slack = True
    if DRYRUN == 1:
        to_slack = False

    if halt_hypervisor:
        message = "Instead of reboot, we will halt the hypervisors. You need to start it yourself for the script to" \
                  " continue moving to the next hypervisor."
        c.print_message(message=message, message_type="Warning", to_slack=to_slack)

    if force_reset_hypervisor:
        message = "Instead of reboot, we will force-reset the hypervisors!"
        c.print_message(message=message, message_type="Warning", to_slack=to_slack)

    if skip_reboot_hypervisor:
        message = "Skipping the reboot of the hypervisors!"
        c.print_message(message=message, message_type="Warning", to_slack=to_slack)

    if firmware_reboot_hypervisor:
        message = "Upgrading HP firmware and rebooting the hypervisor!"
        c.print_message(message=message, message_type="Warning", to_slack=to_slack)

    # In DRYRUN mode only describe what would happen, then stop
    if DRYRUN == 1:
        print
        print "Warning: We are running in DRYRUN mode."
        print
        print "This script will: "
        print " - For any hypervisor it will do this:"
        print " - execute the --pre-empty-script script '" + pre_empty_script + "' on the hypervisor"
        print " - disable the host in Cosmic"
        print " - live migrate all VMs off of it"
        print " - execute the --post-empty-script script '" + post_empty_script + "' on the hypervisor"
        print " - when empty, it will reboot the hypervisor"
        print " (halting is " + str(halt_hypervisor) + ") and (force_reset is " + str(force_reset_hypervisor) + ") and (skip_reboot is " + str(skip_reboot_hypervisor) + ") and (firmware_reboot is " + str(firmware_reboot_hypervisor) + ")"
        print " - will wait for it to come back online (checks SSH connection)"
        print " - execute the --post-reboot-script script '" + post_reboot_script + "' on the hypervisor"
        print " - enable the host in Cosmic"
        print " - waits until host is Connected & Up in Cosmic"
        print " - continues to the next hypervisor"
        print "Then the reboot cyclus for " + clustername + " is done!"
        print

        # Hosts to ignore
        if len(ignoreHosts) > 0:
            print "Note: Ignoring these hosts: " + str(ignoreHosts)
        if len(onlyHosts) > 0:
            print "Note: Only processing these hosts: " + str(onlyHosts)
        if len(skip_os_version) > 0:
            print "Note: Skipping hosts with OS: " + str(skip_os_version)

        print "To kick it off, run with the --exec flag."
        print
        disconnect_all()
        # NOTE(review): a dry run ends with exit status 1, the same status
        # used for errors above - confirm callers expect this.
        sys.exit(1)

    # Start time
    print "Note: Starting @ %s" % time.strftime("%Y-%m-%d %H:%M")
    start_time = datetime.now()

    # Process the hypervisors, one-by-one
    for host in cluster_hosts:
        c.slack_custom_value = host.name
        c.zone_name = host.zonename

        # Apply the --ignore-hosts / --only-hosts / --skip-os-version filters
        if host.name in ignoreHosts:
            message = "Skipping %s due to --ignore-hosts setting" % host.name
            c.print_message(message=message, message_type="Warning", to_slack=False)
            continue

        if len(onlyHosts) > 0 and host.name not in onlyHosts:
            message = "Skipping %s due to --only-hosts setting" % host.name
            c.print_message(message=message, message_type="Warning", to_slack=False)
            continue

        # Substring match of the requested version in the host's reported version
        if len(skip_os_version) > 0 and skip_os_version in host.hypervisorversion:
            message = "Skipping %s with OS '%s' due to --skip-os-version set to '%s'" % (host.name, host.hypervisorversion, skip_os_version)
            c.print_message(message=message, message_type="Warning", to_slack=False)
            continue

        # Execute pre-empty-script; a failure aborts the whole run
        message = "Processing host %s" % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)
        if k.exec_script_on_hypervisor(host, pre_empty_script) is False:
            message = "Executing script '%s' on host %s failed." % (pre_empty_script, host.name)
            c.print_message(message=message, message_type="Error", to_slack=True)
            # NOTE(review): unlike the earlier exits, this one does not call
            # disconnect_all() first.
            sys.exit(1)

        # Start with disabling the host
        if host.resourcestate != "Disabled":
            # Disable host to prevent new VMs landing on it
            if not c.updateHost({'hostid': host.id, 'allocationstate': "Disable"}):
                message = "Disabling host %s failed! Please investigate.." \
                          % (host.name)
                c.print_message(message=message, message_type="Warning", to_slack=True)

        # Poll every 5 seconds until the host reports the Disabled state.
        # NOTE(review): there is no timeout; if disabling failed above this
        # loop keeps polling until the state changes some other way.
        message = "Waiting for host %s to reach Disabled state" % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)
        while True:
            hostData = c.getHostData({'hostid': host.id})
            if hostData[0].resourcestate == "Disabled":
                break
            sys.stdout.write(".")
            sys.stdout.flush()
            time.sleep(5)

        message = "Host %s reached Disabled state" % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)

        running_vms = k.host_get_vms(host)
        message = "Found %s VMs running on host %s. Will now start migrating them to other hosts in the same cluster" % (running_vms, host.name)
        c.print_message(message=message, message_type="Note", to_slack=True)

        # Migrate all vm's and empty hypervisor; retry every 10 seconds until
        # it succeeds. Only the first failure is sent to Slack.
        retries = 0
        while not c.emptyHypervisor(host.id):
            to_slack = False
            if retries == 0:
                to_slack = True
            retries += 1
            message = "Emptying hypervisor %s failed, retrying.." % host.name
            c.print_message(message=message, message_type="Warning", to_slack=to_slack)
            time.sleep(10)

        message = "Emptying hypervisor %s succeeded." % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)

        # Reboot host; the announcement depends on the selected mode, and when
        # several mode flags are set the last matching message wins
        message = "Will execute post_empty scripts, then reboot hypervisor %s" % host.name
        if halt_hypervisor:
            message = "About to halt hypervisor %s. Be sure to start it manually for the procedure to continue!" % host.name
        if force_reset_hypervisor:
            message = "About to force-reset hypervisor %s" % host.name
        if skip_reboot_hypervisor:
            message = "About to skip reboot of hypervisor %s" % host.name
        if firmware_reboot_hypervisor:
            message = "About to upgrade HP firmware and reboot hypervisor %s" % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)

        # A failed reboot/halt/reset aborts the whole run
        if not k.host_reboot(host, halt_hypervisor=halt_hypervisor, force_reset_hypervisor=force_reset_hypervisor, skip_reboot_hypervisor=skip_reboot_hypervisor, firmware_reboot_hypervisor=firmware_reboot_hypervisor):
            message = "Reboot/Halt/Force-reset failed for %s. Please investigate.." % host.name
            c.print_message(message=message, message_type="Error", to_slack=True)
            sys.exit(1)
        message = "Reboot/Halt/Force-reset/Skip-reboot was successful for %s." % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)

        # Enable host
        if not c.updateHost({'hostid': host.id, 'allocationstate': "Enable"}):
            message = "Enabling host %s failed! Please investigate.." \
                      % (host.name)
            c.print_message(message=message, message_type="Warning", to_slack=True)

        # Wait until agent is connected: poll every 5 seconds until the host
        # is both Enabled and Up (no timeout)
        message = "Waiting for %s to connect to Cosmic.." % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)
        while True:
            hostData = c.getHostData({'hostid': host.id})
            if hostData[0].resourcestate == "Enabled" and hostData[0].state == "Up":
                break
            sys.stdout.write(".")
            sys.stdout.flush()
            time.sleep(5)

        # "\033[F" is the ANSI escape that moves the cursor to the start of
        # the previous line - presumably to tidy up after the progress dots
        sys.stdout.write("\033[F")
        message = "Host %s is connected to Cosmic, Enabled and in Up state" % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)

        # Start all VM's with migration policy ShutdownAndStart
        message = "Starting all VM's on Host %s with ShutdownAndStart policy" % host.name
        c.print_message(message=message, message_type="Note", to_slack=True)
        c.startVmsWithShutPolicy()

        # Print hypervisor info after each processed host
        message = "Gathering some info about hypervisors in cluster '%s'" % clustername
        c.print_message(message=message, message_type="Note", to_slack=False)
        c.printHypervisors(clusterID, False, checkBonds, "KVM")

    # Print cluster info
    message = "Some info about cluster '" + clustername + "':"
    c.print_message(message=message, message_type="Note", to_slack=False)
    c.printCluster(clusterID, "KVM")

    # Disconnect
    disconnect_all()

    # Done
    # End time
    message = "Finished @ " + time.strftime("%Y-%m-%d %H:%M")
    c.print_message(message=message, message_type="Note", to_slack=False)
    elapsed_time = datetime.now() - start_time
    message = "We're done with cluster %s. Rebooting cluster took %s seconds." % (clustername,
                                                                                 str(elapsed_time.total_seconds()))
    c.print_message(message=message, message_type="Note", to_slack=True)