diff --git a/Makefile b/Makefile index d07a8ca3..42cdfa93 100755 --- a/Makefile +++ b/Makefile @@ -8,6 +8,7 @@ SM_DRIVERS += udev SM_DRIVERS += ISO SM_DRIVERS += HBA SM_DRIVERS += RawHBA +SM_DRIVERS += Linstor SM_DRIVERS += LVHD SM_DRIVERS += LVHDoISCSI SM_DRIVERS += LVHDoHBA @@ -17,6 +18,12 @@ SM_DRIVERS += OCFSoHBA SM_DRIVERS += SHM SM_DRIVERS += SMB SM_DRIVERS += LVHDoFCoE +SM_DRIVERS += CephFS +SM_DRIVERS += GlusterFS +SM_DRIVERS += XFS +SM_DRIVERS += ZFS +SM_DRIVERS += EXT4 +SM_DRIVERS += MooseFS SM_LIBS := SR SM_LIBS += SRCommand @@ -30,6 +37,9 @@ SM_LIBS += verifyVHDsOnSR SM_LIBS += scsiutil SM_LIBS += scsi_host_rescan SM_LIBS += vhdutil +SM_LIBS += linstorjournaler +SM_LIBS += linstorvhdutil +SM_LIBS += linstorvolumemanager SM_LIBS += lvhdutil SM_LIBS += cifutils SM_LIBS += xs_errors @@ -68,6 +78,7 @@ SM_LIBS += sr_health_check UDEV_RULES = 65-multipath 55-xs-mpath-scsidev 57-usb 58-xapi MPATH_DAEMON = sm-multipath MPATH_CONF = multipath.conf +MPATH_CUSTOM_CONF = custom.conf SMLOG_CONF = SMlog SM_XML := XE_SR_ERRORCODES @@ -80,9 +91,11 @@ PLUGIN_SCRIPT_DEST := /etc/xapi.d/plugins/ LIBEXEC := /opt/xensource/libexec/ UDEV_RULES_DIR := /etc/udev/rules.d/ UDEV_SCRIPTS_DIR := /etc/udev/scripts/ +SYSTEMD_CONF_DIR := /etc/systemd/system/ SYSTEMD_SERVICE_DIR := /usr/lib/systemd/system/ INIT_DIR := /etc/rc.d/init.d/ MPATH_CONF_DIR := /etc/multipath.xenserver/ +MPATH_CUSTOM_CONF_DIR := /etc/multipath/conf.d/ MODPROBE_DIR := /etc/modprobe.d/ EXTENSION_SCRIPT_DEST := /etc/xapi.d/extensions/ LOGROTATE_DIR := /etc/logrotate.d/ @@ -95,6 +108,7 @@ SM_PY_FILES = $(foreach LIB, $(SM_LIBS), drivers/$(LIB).py) $(foreach DRIVER, $( .PHONY: build build: make -C dcopy + make -C linstor .PHONY: precommit precommit: build @@ -133,8 +147,12 @@ install: precheck mkdir -p $(SM_STAGING)$(UDEV_RULES_DIR) mkdir -p $(SM_STAGING)$(UDEV_SCRIPTS_DIR) mkdir -p $(SM_STAGING)$(INIT_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR)/drbd-reactor.service.d + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d mkdir -p $(SM_STAGING)$(SYSTEMD_SERVICE_DIR) mkdir -p $(SM_STAGING)$(MPATH_CONF_DIR) + mkdir -p $(SM_STAGING)$(MPATH_CUSTOM_CONF_DIR) mkdir -p $(SM_STAGING)$(MODPROBE_DIR) mkdir -p $(SM_STAGING)$(LOGROTATE_DIR) mkdir -p $(SM_STAGING)$(DEBUG_DEST) @@ -152,12 +170,20 @@ install: precheck $(SM_STAGING)$(SM_DEST)/plugins/ install -m 644 multipath/$(MPATH_CONF) \ $(SM_STAGING)/$(MPATH_CONF_DIR) + install -m 644 multipath/$(MPATH_CUSTOM_CONF) \ + $(SM_STAGING)/$(MPATH_CUSTOM_CONF_DIR) install -m 755 multipath/sm-multipath \ $(SM_STAGING)/$(INIT_DIR) install -m 755 multipath/multipath-root-setup \ $(SM_STAGING)/$(SM_DEST) install -m 644 etc/logrotate.d/$(SMLOG_CONF) \ $(SM_STAGING)/$(LOGROTATE_DIR) + install -m 644 etc/systemd/system/drbd-reactor.service.d/override.conf \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/drbd-reactor.service.d/ + install -m 644 etc/systemd/system/linstor-satellite.service.d/override.conf \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d/ + install -m 644 etc/systemd/system/var-lib-linstor.service \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR) install -m 644 etc/make-dummy-sr.service \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/xs-sm.service \ @@ -174,6 +200,8 @@ install: precheck $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/sr_health_check.timer \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) + install -m 644 systemd/linstor-monitor.service \ + $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) for i in $(UDEV_RULES); do \ 
install -m 644 udev/$$i.rules \ $(SM_STAGING)$(UDEV_RULES_DIR); done @@ -194,6 +222,7 @@ install: precheck cd $(SM_STAGING)$(SM_DEST) && rm -f OCFSoHBASR ln -sf $(SM_DEST)mpathutil.py $(SM_STAGING)/sbin/mpathutil install -m 755 drivers/02-vhdcleanup $(SM_STAGING)$(MASTER_SCRIPT_DEST) + install -m 755 drivers/linstor-manager $(SM_STAGING)$(PLUGIN_SCRIPT_DEST) install -m 755 drivers/lvhd-thin $(SM_STAGING)$(PLUGIN_SCRIPT_DEST) install -m 755 drivers/on_slave.py $(SM_STAGING)$(PLUGIN_SCRIPT_DEST)/on-slave install -m 755 drivers/testing-hooks $(SM_STAGING)$(PLUGIN_SCRIPT_DEST) @@ -206,12 +235,16 @@ install: precheck install -m 755 drivers/iscsilib.py $(SM_STAGING)$(SM_DEST) install -m 755 drivers/fcoelib.py $(SM_STAGING)$(SM_DEST) mkdir -p $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/fork-log-daemon $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/linstor-kv-tool $(SM_STAGING)$(BIN_DEST) + install -m 755 scripts/safe-umount $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/local-device-change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/check-device-sharing $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/usb_change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/kickpipe $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/set-iscsi-initiator $(SM_STAGING)$(LIBEXEC) $(MAKE) -C dcopy install DESTDIR=$(SM_STAGING) + $(MAKE) -C linstor install DESTDIR=$(SM_STAGING) ln -sf $(SM_DEST)blktap2.py $(SM_STAGING)$(BIN_DEST)/blktap2 ln -sf $(SM_DEST)lcache.py $(SM_STAGING)$(BIN_DEST)tapdisk-cache-stats ln -sf /dev/null $(SM_STAGING)$(UDEV_RULES_DIR)/69-dm-lvm-metad.rules @@ -225,4 +258,3 @@ install: precheck .PHONY: clean clean: rm -rf $(SM_STAGING) - diff --git a/drivers/CephFSSR.py b/drivers/CephFSSR.py new file mode 100644 index 00000000..09c928be --- /dev/null +++ b/drivers/CephFSSR.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# CEPHFSSR: Based on FileSR, mounts ceph fs share + +import errno +import os +import syslog as _syslog +import xmlrpclib +from syslog import syslog + +# careful with the import order here +# FileSR has a circular dependency: +# FileSR -> blktap2 -> lvutil -> EXTSR -> FileSR +# importing in this order seems to avoid triggering the issue. 
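The mount() method further down in this new CephFSSR.py ultimately just assembles a standard `mount -t ceph` command line from the device-config keys listed in CONFIGURATION (server, serverport, serverpath, options). A condensed sketch of that command construction, with a hypothetical helper name and the 6789 default port the driver falls back to:

```python
# Condensed sketch of the command construction performed by CephFSSR.mount()
# below; the helper name is hypothetical, the 6789 default matches the driver.
def build_ceph_mount_command(dconf, mountpoint):
    server = dconf['server']                # e.g. "192.168.0.12"
    port = dconf.get('serverport', '6789')  # optional, defaults to 6789
    path = dconf['serverpath']              # e.g. "/"
    options = []
    if dconf.get('options'):                # e.g. "name=admin,secretfile=/etc/ceph/admin.secret"
        options = ['-o', dconf['options']]
    device = '{}:{}:{}'.format(server, port, path)
    return ['mount', '-t', 'ceph', device, mountpoint] + options

# build_ceph_mount_command({'server': '10.0.0.1', 'serverpath': '/',
#                           'options': 'name=admin,secretfile=/etc/ceph/admin.secret'},
#                          '/var/run/sr-mount/CephFS/<sr_uuid>')
# -> ['mount', '-t', 'ceph', '10.0.0.1:6789:/', '/var/run/sr-mount/CephFS/<sr_uuid>',
#     '-o', 'name=admin,secretfile=/etc/ceph/admin.secret']
```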
+import SR +import SRCommand +import FileSR +# end of careful +import cleanup +import util +import vhdutil +import xs_errors +from lock import Lock + +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", + "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", + "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", + "VDI_GENERATE_CONFIG", + "VDI_RESET_ON_BOOT/2", "ATOMIC_PAUSE"] + +CONFIGURATION = [ + ['server', 'Ceph server(s) (required, ex: "192.168.0.12" or "10.10.10.10,10.10.10.26")'], + ['serverpath', 'Ceph FS path (required, ex: "/")'], + ['serverport', 'ex: 6789'], + ['options', 'Ceph FS client name, and secretfile (required, ex: "name=admin,secretfile=/etc/ceph/admin.secret")'] +] + +DRIVER_INFO = { + 'name': 'CephFS VHD', + 'description': 'SR plugin which stores disks as VHD files on a CephFS storage', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2020 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +# The mountpoint for the directory when performing an sr_probe. All probes +# are guaranteed to be serialised by xapi, so this single mountpoint is fine. +PROBE_MOUNTPOINT = os.path.join(SR.MOUNT_BASE, "probe") + + +class CephFSException(Exception): + def __init__(self, errstr): + self.errstr = errstr + + +# mountpoint = /var/run/sr-mount/CephFS/uuid +# linkpath = mountpoint/uuid - path to SR directory on share +# path = /var/run/sr-mount/uuid - symlink to SR directory on share +class CephFSSR(FileSR.FileSR): + """Ceph file-based storage repository""" + + DRIVER_TYPE = 'cephfs' + + def handles(sr_type): + # fudge, because the parent class (FileSR) checks for smb to alter its behavior + return sr_type == CephFSSR.DRIVER_TYPE or sr_type == 'smb' + + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_ceph_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='ceph is not installed' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + self.driver_config = DRIVER_CONFIG + if 'server' not in self.dconf: + raise xs_errors.XenError('ConfigServerMissing') + self.remoteserver = self.dconf['server'] + self.remotepath = self.dconf['serverpath'] + # if serverport is not specified, use default 6789 + if 'serverport' not in self.dconf: + self.remoteport = "6789" + else: + self.remoteport = self.dconf['serverport'] + if self.sr_ref and self.session is not None: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + self.mountpoint = os.path.join(SR.MOUNT_BASE, 'CephFS', sr_uuid) + self.linkpath = os.path.join(self.mountpoint, sr_uuid or "") + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self._check_o_direct() + + def checkmount(self): + return util.ioretry(lambda: ((util.pathexists(self.mountpoint) and + util.ismount(self.mountpoint)) and + util.pathexists(self.path))) + + def mount(self, mountpoint=None): + """Mount the remote ceph export at 'mountpoint'""" + if mountpoint is None: + mountpoint = self.mountpoint + elif not util.is_string(mountpoint) or mountpoint == "": + raise CephFSException("mountpoint not a string object") + + try: + if not util.ioretry(lambda: util.isdir(mountpoint)): + util.ioretry(lambda: util.makedirs(mountpoint)) + except util.CommandException, inst: + raise CephFSException("Failed to make 
directory: code is %d" % inst.code) + + try: + options = [] + if self.dconf.has_key('options'): + options.append(self.dconf['options']) + if options: + options = ['-o', ','.join(options)] + command = ["mount", '-t', 'ceph', self.remoteserver+":"+self.remoteport+":"+self.remotepath, mountpoint] + options + util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) + except util.CommandException, inst: + syslog(_syslog.LOG_ERR, 'CephFS mount failed ' + inst.__str__()) + raise CephFSException("mount failed with return code %d" % inst.code) + + # Sanity check to ensure that the user has at least RO access to the + # mounted share. Windows sharing and security settings can be tricky. + try: + util.listdir(mountpoint) + except util.CommandException: + try: + self.unmount(mountpoint, True) + except CephFSException: + util.logException('CephFSSR.unmount()') + raise CephFSException("Permission denied. Please check user privileges.") + + def unmount(self, mountpoint, rmmountpoint): + try: + util.pread(["umount", mountpoint]) + except util.CommandException, inst: + raise CephFSException("umount failed with return code %d" % inst.code) + if rmmountpoint: + try: + os.rmdir(mountpoint) + except OSError, inst: + raise CephFSException("rmdir failed with error '%s'" % inst.strerror) + + def attach(self, sr_uuid): + if not self.checkmount(): + try: + self.mount() + os.symlink(self.linkpath, self.path) + except CephFSException, exc: + raise SR.SROSError(12, exc.errstr) + self.attached = True + + def probe(self): + try: + self.mount(PROBE_MOUNTPOINT) + sr_list = filter(util.match_uuid, util.listdir(PROBE_MOUNTPOINT)) + self.unmount(PROBE_MOUNTPOINT, True) + except (util.CommandException, xs_errors.XenError): + raise + # Create a dictionary from the SR uuids to feed SRtoXML() + sr_dict = {sr_uuid: {} for sr_uuid in sr_list} + return util.SRtoXML(sr_dict) + + def detach(self, sr_uuid): + if not self.checkmount(): + return + util.SMlog("Aborting GC/coalesce") + cleanup.abort(self.uuid) + # Change directory to avoid unmount conflicts + os.chdir(SR.MOUNT_BASE) + self.unmount(self.mountpoint, True) + os.unlink(self.path) + self.attached = False + + def create(self, sr_uuid, size): + if self.checkmount(): + raise SR.SROSError(113, 'CephFS mount point already attached') + + try: + self.mount() + except CephFSException, exc: + # noinspection PyBroadException + try: + os.rmdir(self.mountpoint) + except: + # we have no recovery strategy + pass + raise SR.SROSError(111, "CephFS mount error [opterr=%s]" % exc.errstr) + + if util.ioretry(lambda: util.pathexists(self.linkpath)): + if len(util.ioretry(lambda: util.listdir(self.linkpath))) != 0: + self.detach(sr_uuid) + raise xs_errors.XenError('SRExists') + else: + try: + util.ioretry(lambda: util.makedirs(self.linkpath)) + os.symlink(self.linkpath, self.path) + except util.CommandException, inst: + if inst.code != errno.EEXIST: + try: + self.unmount(self.mountpoint, True) + except CephFSException: + util.logException('CephFSSR.unmount()') + raise SR.SROSError(116, + "Failed to create CephFS SR. 
remote directory creation error: {}".format( + os.strerror(inst.code))) + self.detach(sr_uuid) + + def delete(self, sr_uuid): + # try to remove/delete non VDI contents first + super(CephFSSR, self).delete(sr_uuid) + try: + if self.checkmount(): + self.detach(sr_uuid) + self.mount() + if util.ioretry(lambda: util.pathexists(self.linkpath)): + util.ioretry(lambda: os.rmdir(self.linkpath)) + util.SMlog(str(self.unmount(self.mountpoint, True))) + except util.CommandException, inst: + self.detach(sr_uuid) + if inst.code != errno.ENOENT: + raise SR.SROSError(114, "Failed to remove CephFS mount point") + + def vdi(self, uuid, loadLocked=False): + return CephFSFileVDI(self, uuid) + + @staticmethod + def _is_ceph_available(): + import distutils.spawn + return distutils.spawn.find_executable('ceph') + +class CephFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = CephFSSR.DRIVER_TYPE + + return super(CephFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + def generate_config(self, sr_uuid, vdi_uuid): + util.SMlog("SMBFileVDI.generate_config") + if not util.pathexists(self.path): + raise xs_errors.XenError('VDIUnavailable') + resp = {'device_config': self.sr.dconf, + 'sr_uuid': sr_uuid, + 'vdi_uuid': vdi_uuid, + 'sr_sm_config': self.sr.sm_config, + 'command': 'vdi_attach_from_config'} + # Return the 'config' encoded within a normal XMLRPC response so that + # we can use the regular response/error parsing code. + config = xmlrpclib.dumps(tuple([resp]), "vdi_attach_from_config") + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + except: + util.logException("SMBFileVDI.attach_from_config") + raise xs_errors.XenError('SRUnavailable', + opterr='Unable to attach from config') + + +if __name__ == '__main__': + SRCommand.run(CephFSSR, DRIVER_INFO) +else: + SR.registerSR(CephFSSR) diff --git a/drivers/EXT4SR.py b/drivers/EXT4SR.py new file mode 100644 index 00000000..bd67d003 --- /dev/null +++ b/drivers/EXT4SR.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# EXT4SR: Based on local-file storage repository, mounts ext4 partition + +import SR, SRCommand, FileSR, util, lvutil, scsiutil + +import os +import xs_errors +import vhdutil +from lock import Lock +from constants import EXT_PREFIX + +CAPABILITIES = ["SR_PROBE","SR_UPDATE", "SR_SUPPORTS_LOCAL_CACHING", \ + "VDI_CREATE","VDI_DELETE","VDI_ATTACH","VDI_DETACH", \ + "VDI_UPDATE","VDI_CLONE","VDI_SNAPSHOT","VDI_RESIZE","VDI_MIRROR", \ + "VDI_GENERATE_CONFIG", \ + "VDI_RESET_ON_BOOT/2","ATOMIC_PAUSE", "VDI_CONFIG_CBT", + "VDI_ACTIVATE", "VDI_DEACTIVATE"] + +CONFIGURATION = [ [ 'device', 'local device path (required) (e.g. /dev/sda3)' ] ] + +DRIVER_INFO = { + 'name': 'Local EXT4 VHD', + 'description': 'SR plugin which represents disks as VHD files stored on a local EXT4 filesystem, created inside an LVM volume', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2019 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION + } + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +class EXT4SR(FileSR.FileSR): + """EXT4 Local file storage repository""" + def handles(srtype): + return srtype == 'ext4' + handles = staticmethod(handles) + + def load(self, sr_uuid): + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + if not self.dconf.has_key('device') or not self.dconf['device']: + raise xs_errors.XenError('ConfigDeviceMissing') + + self.root = self.dconf['device'] + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self.vgname = EXT_PREFIX + sr_uuid + self.remotepath = os.path.join("/dev",self.vgname,sr_uuid) + self.attached = self._checkmount() + self.driver_config = DRIVER_CONFIG + + def delete(self, sr_uuid): + super(EXT4SR, self).delete(sr_uuid) + + # Check PVs match VG + try: + for dev in self.root.split(','): + cmd = ["pvs", dev] + txt = util.pread2(cmd) + if txt.find(self.vgname) == -1: + raise xs_errors.XenError('VolNotFound', \ + opterr='volume is %s' % self.vgname) + except util.CommandException, inst: + raise xs_errors.XenError('PVSfailed', \ + opterr='error is %d' % inst.code) + + # Remove LV, VG and pv + try: + cmd = ["lvremove", "-f", self.remotepath] + util.pread2(cmd) + + cmd = ["vgremove", self.vgname] + util.pread2(cmd) + + for dev in self.root.split(','): + cmd = ["pvremove", dev] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMDelete', \ + opterr='errno is %d' % inst.code) + + def attach(self, sr_uuid): + if not self._checkmount(): + try: + #Activate LV + cmd = ['lvchange','-ay',self.remotepath] + util.pread2(cmd) + + # make a mountpoint: + if not os.path.isdir(self.path): + os.makedirs(self.path) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Unable to activate LV. Errno is %d' % inst.code) + + try: + util.pread(["fsck", "-a", self.remotepath]) + except util.CommandException, inst: + if inst.code == 1: + util.SMlog("FSCK detected and corrected FS errors. Not fatal.") + else: + raise xs_errors.XenError('LVMMount', \ + opterr='FSCK failed on %s. 
Errno is %d' % (self.remotepath,inst.code)) + + try: + util.pread(["mount", self.remotepath, self.path]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Failed to mount FS. Errno is %d' % inst.code) + + self.attached = True + + #Update SCSIid string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + # Set the block scheduler + for dev in self.root.split(','): self.block_setscheduler(dev) + + def detach(self, sr_uuid): + super(EXT4SR, self).detach(sr_uuid) + try: + # deactivate SR + cmd = ["lvchange", "-an", self.remotepath] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMUnMount', \ + opterr='lvm -an failed errno is %d' % inst.code) + + def probe(self): + return lvutil.srlist_toxml(lvutil.scan_srlist(EXT_PREFIX, self.root), + EXT_PREFIX) + + def create(self, sr_uuid, size): + # THIS DRIVER IS DEPRECATED. RAISE. + raise Exception('The `ext4` SR type is deprecated since XCP-ng 8.1.\n' + 'Use the main `ext` driver instead. It will create an EXT4 filesystem now, ' + 'not EXT3 anymore as it used to.') + + if self._checkmount(): + raise xs_errors.XenError('SRExists') + + # Check none of the devices already in use by other PBDs + if util.test_hostPBD_devs(self.session, sr_uuid, self.root): + raise xs_errors.XenError('SRInUse') + + # Check serial number entry in SR records + for dev in self.root.split(','): + if util.test_scsiserial(self.session, dev): + raise xs_errors.XenError('SRInUse') + + if not lvutil._checkVG(self.vgname): + lvutil.createVG(self.root, self.vgname) + + if lvutil._checkLV(self.remotepath): + raise xs_errors.XenError('SRExists') + + try: + numdevs = len(self.root.split(',')) + cmd = ["lvcreate", "-n", sr_uuid] + if numdevs > 1: + lowest = -1 + for dev in self.root.split(','): + stats = lvutil._getPVstats(dev) + if lowest < 0 or stats['freespace'] < lowest: + lowest = stats['freespace'] + size_mb = (lowest / (1024 * 1024)) * numdevs + + # Add stripe parameter to command + cmd += ["-i", str(numdevs), "-I", "2048"] + else: + stats = lvutil._getVGstats(self.vgname) + size_mb = stats['freespace'] / (1024 * 1024) + assert(size_mb > 0) + cmd += ["-L", str(size_mb), self.vgname] + text = util.pread(cmd) + + cmd = ["lvchange", "-ay", self.remotepath] + text = util.pread(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMCreate', \ + opterr='lv operation, error %d' % inst.code) + except AssertionError: + raise xs_errors.XenError('SRNoSpace', \ + opterr='Insufficient space in VG %s' % self.vgname) + + try: + util.pread2(["mkfs.ext4", "-F", self.remotepath]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMFilesystem', \ + opterr='mkfs failed error %d' % inst.code) + + #Update serial number string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + def vdi(self, uuid, loadLocked = False): + if not loadLocked: + return EXTFileVDI(self, uuid) + return EXTFileVDI(self, uuid) + + +class EXTFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self,'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data["storage-type"]="ext" + + return super(EXTFileVDI, self).attach(sr_uuid, vdi_uuid) + + +if __name__ == '__main__': + SRCommand.run(EXT4SR, DRIVER_INFO) +else: + SR.registerSR(EXT4SR) \ No newline at end of file diff --git a/drivers/EXTSR.py b/drivers/EXTSR.py index 4caa09df..da43082e 100755 --- 
a/drivers/EXTSR.py +++ b/drivers/EXTSR.py @@ -18,6 +18,7 @@ # EXTSR: Based on local-file storage repository, mounts ext3 partition import SR, SRCommand, FileSR, util, lvutil, scsiutil +from SR import deviceCheck import os import xs_errors @@ -141,10 +142,12 @@ def detach(self, sr_uuid): raise xs_errors.XenError('LVMUnMount', \ opterr='lvm -an failed errno is %d' % inst.code) + @deviceCheck def probe(self): return lvutil.srlist_toxml(lvutil.scan_srlist(EXT_PREFIX, self.dconf['device']), EXT_PREFIX) + @deviceCheck def create(self, sr_uuid, size): if self._checkmount(): raise xs_errors.XenError('SRExists') diff --git a/drivers/GlusterFSSR.py b/drivers/GlusterFSSR.py new file mode 100644 index 00000000..61a7d409 --- /dev/null +++ b/drivers/GlusterFSSR.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +import errno +import os +import syslog as _syslog +import xmlrpclib +from syslog import syslog + +# careful with the import order here +# FileSR has a circular dependency: FileSR- > blktap2 -> lvutil -> EXTSR -> FileSR +# importing in this order seems to avoid triggering the issue. +import SR +import SRCommand +import FileSR +# end of careful +import cleanup +import util +import vhdutil +import xs_errors +from lock import Lock + +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", + "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", + "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", + "VDI_GENERATE_CONFIG", + "VDI_RESET_ON_BOOT/2", "ATOMIC_PAUSE"] + +CONFIGURATION = [['server', 'Full path to share on gluster server (required, ex: "192.168.0.12:/gv0")'], + ['backupservers', 'list of servers separated by ":"'], + ['fetchattempts', 'number of attempts to fetch files before switching to the backup server'] + ] + +DRIVER_INFO = { + 'name': 'GlusterFS VHD', + 'description': 'SR plugin which stores disks as VHD files on a GlusterFS storage', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2020 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +# The mountpoint for the directory when performing an sr_probe. All probes +# are guaranteed to be serialised by xapi, so this single mountpoint is fine. 
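The EXTSR.probe() and EXTSR.create() hunks above (and the matching LVHDSR hunks later in this diff) gain a `@deviceCheck` decorator imported from SR. Its implementation is not part of this diff; assuming it merely rejects calls whose device-config lacks a usable 'device' entry, a decorator of that shape could look like the sketch below (names and the raised exception are illustrative, the in-tree version presumably raises xs_errors.XenError('ConfigDeviceMissing') as load() does):

```python
import functools

def device_check(method):
    """Illustrative stand-in for the SR.deviceCheck decorator used above.
    Assumption: it refuses to run the wrapped SR method unless device-config
    carries a non-empty 'device' entry."""
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        if not self.dconf.get('device'):
            raise Exception('ConfigDeviceMissing: no device specified')
        return method(self, *args, **kwargs)
    return wrapper

class FakeSR(object):
    def __init__(self, dconf):
        self.dconf = dconf

    @device_check
    def probe(self):
        return '<SRlist/>'

# FakeSR({'device': '/dev/sda3'}).probe() succeeds;
# FakeSR({}).probe() raises before touching any device.
```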
+PROBE_MOUNTPOINT = os.path.join(SR.MOUNT_BASE, "probe") + + +class GlusterFSException(Exception): + def __init__(self, errstr): + self.errstr = errstr + + +# mountpoint = /var/run/sr-mount/GlusterFS//uuid +# linkpath = mountpoint/uuid - path to SR directory on share +# path = /var/run/sr-mount/uuid - symlink to SR directory on share +class GlusterFSSR(FileSR.FileSR): + """Gluster file-based storage repository""" + + DRIVER_TYPE = 'glusterfs' + + def handles(sr_type): + # fudge, because the parent class (FileSR) checks for smb to alter its behavior + return sr_type == GlusterFSSR.DRIVER_TYPE or sr_type == 'smb' + + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_glusterfs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='glusterfs is not installed' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + self.driver_config = DRIVER_CONFIG + if 'server' not in self.dconf: + raise xs_errors.XenError('ConfigServerMissing') + # Can be None => on-slave plugin hack (is_open function). + self.remoteserver = self.dconf['server'] or '' + if self.sr_ref and self.session is not None: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + self.mountpoint = os.path.join(SR.MOUNT_BASE, 'GlusterFS', self.remoteserver.split(':')[0], sr_uuid) + self.linkpath = os.path.join(self.mountpoint, sr_uuid or "") + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self._check_o_direct() + + def checkmount(self): + return util.ioretry(lambda: ((util.pathexists(self.mountpoint) and + util.ismount(self.mountpoint)) and + util.pathexists(self.linkpath))) + + def mount(self, mountpoint=None): + """Mount the remote gluster export at 'mountpoint'""" + if mountpoint is None: + mountpoint = self.mountpoint + elif not util.is_string(mountpoint) or mountpoint == "": + raise GlusterFSException("mountpoint not a string object") + + try: + if not util.ioretry(lambda: util.isdir(mountpoint)): + util.ioretry(lambda: util.makedirs(mountpoint)) + except util.CommandException, inst: + raise GlusterFSException("Failed to make directory: code is %d" % inst.code) + try: + options = [] + if 'backupservers' in self.dconf: + options.append('backup-volfile-servers=' + self.dconf['backupservers']) + if 'fetchattempts' in self.dconf: + options.append('fetch-attempts=' + self.dconf['fetchattempts']) + if options: + options = ['-o', ','.join(options)] + command = ["mount", '-t', 'glusterfs', self.remoteserver, mountpoint] + options + util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) + except util.CommandException, inst: + syslog(_syslog.LOG_ERR, 'GlusterFS mount failed ' + inst.__str__()) + raise GlusterFSException("mount failed with return code %d" % inst.code) + + # Sanity check to ensure that the user has at least RO access to the + # mounted share. Windows sharing and security settings can be tricky. + try: + util.listdir(mountpoint) + except util.CommandException: + try: + self.unmount(mountpoint, True) + except GlusterFSException: + util.logException('GlusterFSSR.unmount()') + raise GlusterFSException("Permission denied. 
Please check user privileges.") + + def unmount(self, mountpoint, rmmountpoint): + try: + util.pread(["umount", mountpoint]) + except util.CommandException, inst: + raise GlusterFSException("umount failed with return code %d" % inst.code) + if rmmountpoint: + try: + os.rmdir(mountpoint) + except OSError, inst: + raise GlusterFSException("rmdir failed with error '%s'" % inst.strerror) + + def attach(self, sr_uuid): + if not self.checkmount(): + try: + self.mount() + os.symlink(self.linkpath, self.path) + except GlusterFSException, exc: + raise SR.SROSError(12, exc.errstr) + self.attached = True + + def probe(self): + try: + self.mount(PROBE_MOUNTPOINT) + sr_list = filter(util.match_uuid, util.listdir(PROBE_MOUNTPOINT)) + self.unmount(PROBE_MOUNTPOINT, True) + except (util.CommandException, xs_errors.XenError): + raise + # Create a dictionary from the SR uuids to feed SRtoXML() + sr_dict = {sr_uuid: {} for sr_uuid in sr_list} + return util.SRtoXML(sr_dict) + + def detach(self, sr_uuid): + if not self.checkmount(): + return + util.SMlog("Aborting GC/coalesce") + cleanup.abort(self.uuid) + # Change directory to avoid unmount conflicts + os.chdir(SR.MOUNT_BASE) + self.unmount(self.mountpoint, True) + os.unlink(self.path) + self.attached = False + + def create(self, sr_uuid, size): + if self.checkmount(): + raise SR.SROSError(113, 'GlusterFS mount point already attached') + + try: + self.mount() + except GlusterFSException, exc: + # noinspection PyBroadException + try: + os.rmdir(self.mountpoint) + except: + # we have no recovery strategy + pass + raise SR.SROSError(111, "GlusterFS mount error [opterr=%s]" % exc.errstr) + + if util.ioretry(lambda: util.pathexists(self.linkpath)): + if len(util.ioretry(lambda: util.listdir(self.linkpath))) != 0: + self.detach(sr_uuid) + raise xs_errors.XenError('SRExists') + else: + try: + util.ioretry(lambda: util.makedirs(self.linkpath)) + os.symlink(self.linkpath, self.path) + except util.CommandException, inst: + if inst.code != errno.EEXIST: + try: + self.unmount(self.mountpoint, True) + except GlusterFSException: + util.logException('GlusterFSSR.unmount()') + raise SR.SROSError(116, + "Failed to create GlusterFS SR. 
remote directory creation error: {}".format( + os.strerror(inst.code))) + self.detach(sr_uuid) + + def delete(self, sr_uuid): + # try to remove/delete non VDI contents first + super(GlusterFSSR, self).delete(sr_uuid) + try: + if self.checkmount(): + self.detach(sr_uuid) + self.mount() + if util.ioretry(lambda: util.pathexists(self.linkpath)): + util.ioretry(lambda: os.rmdir(self.linkpath)) + self.unmount(self.mountpoint, True) + except util.CommandException, inst: + self.detach(sr_uuid) + if inst.code != errno.ENOENT: + raise SR.SROSError(114, "Failed to remove GlusterFS mount point") + + def vdi(self, uuid, loadLocked=False): + return GlusterFSFileVDI(self, uuid) + + @staticmethod + def _is_glusterfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('glusterfs') + + +class GlusterFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = GlusterFSSR.DRIVER_TYPE + + return super(GlusterFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + def generate_config(self, sr_uuid, vdi_uuid): + util.SMlog("SMBFileVDI.generate_config") + if not util.pathexists(self.path): + raise xs_errors.XenError('VDIUnavailable') + resp = {'device_config': self.sr.dconf, + 'sr_uuid': sr_uuid, + 'vdi_uuid': vdi_uuid, + 'sr_sm_config': self.sr.sm_config, + 'command': 'vdi_attach_from_config'} + # Return the 'config' encoded within a normal XMLRPC response so that + # we can use the regular response/error parsing code. + config = xmlrpclib.dumps(tuple([resp]), "vdi_attach_from_config") + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + except: + util.logException("SMBFileVDI.attach_from_config") + raise xs_errors.XenError('SRUnavailable', + opterr='Unable to attach from config') + + +if __name__ == '__main__': + SRCommand.run(GlusterFSSR, DRIVER_INFO) +else: + SR.registerSR(GlusterFSSR) diff --git a/drivers/ISOSR.py b/drivers/ISOSR.py index 5b126837..35085487 100755 --- a/drivers/ISOSR.py +++ b/drivers/ISOSR.py @@ -455,10 +455,9 @@ def getCacheOptions(self): def detach(self, sr_uuid): """Std. 
detach""" - # This handles legacy mode too, so no need to check - if not self._checkmount(): - return - + if 'legacy_mode' in self.dconf or not self._checkmount(): + return + try: util.pread(["umount", self.mountpoint]); except util.CommandException, inst: diff --git a/drivers/LVHDSR.py b/drivers/LVHDSR.py index cfee0717..6ac3f804 100755 --- a/drivers/LVHDSR.py +++ b/drivers/LVHDSR.py @@ -19,6 +19,7 @@ # import SR +from SR import deviceCheck import VDI import SRCommand import util @@ -494,7 +495,7 @@ def _expand_size(self): util.logException("LVHDSR._expand_size for %s failed to resize" " the PV" % self.uuid) - + @deviceCheck def create(self, uuid, size): util.SMlog("LVHDSR.create for %s" % self.uuid) if not self.isMaster: @@ -849,6 +850,7 @@ def _updateStats(self, uuid, virtAllocDelta): self.physical_utilisation = stats['physical_utilisation'] self._db_update() + @deviceCheck def probe(self): return lvutil.srlist_toxml( lvutil.scan_srlist(lvhdutil.VG_PREFIX, self.dconf['device']), @@ -1530,10 +1532,11 @@ def detach(self, sr_uuid, vdi_uuid): elif self.sr.provision == "thick": needDeflate = False # except for snapshots, which are always deflated - vdi_ref = self.sr.srcmd.params['vdi_ref'] - snap = self.session.xenapi.VDI.get_is_a_snapshot(vdi_ref) - if snap: - needDeflate = True + if self.sr.srcmd.cmd != 'vdi_detach_from_config': + vdi_ref = self.sr.srcmd.params['vdi_ref'] + snap = self.session.xenapi.VDI.get_is_a_snapshot(vdi_ref) + if snap: + needDeflate = True if needDeflate: try: diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py new file mode 100755 index 00000000..d3c15b6c --- /dev/null +++ b/drivers/LinstorSR.py @@ -0,0 +1,2897 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +from constants import CBTLOG_TAG + +try: + from linstorjournaler import LinstorJournaler + from linstorvhdutil import LinstorVhdUtil + from linstorvolumemanager import get_controller_uri + from linstorvolumemanager import get_controller_node_name + from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import LinstorVolumeManagerError + + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + +from lock import Lock +import blktap2 +import cleanup +import distutils +import errno +import functools +import lvutil +import os +import re +import scsiutil +import signal +import socket +import SR +import SRCommand +import subprocess +import time +import traceback +import util +import VDI +import vhdutil +import xml.etree.ElementTree as xml_parser +import xmlrpclib +import xs_errors + +from srmetadata import \ + NAME_LABEL_TAG, NAME_DESCRIPTION_TAG, IS_A_SNAPSHOT_TAG, SNAPSHOT_OF_TAG, \ + TYPE_TAG, VDI_TYPE_TAG, READ_ONLY_TAG, SNAPSHOT_TIME_TAG, \ + METADATA_OF_POOL_TAG + +HIDDEN_TAG = 'hidden' + +XHA_CONFIG_PATH = '/etc/xensource/xhad.conf' + +FORK_LOG_DAEMON = '/opt/xensource/libexec/fork-log-daemon' + +# This flag can be disabled to debug the DRBD layer. +# When this config var is False, the HA can only be used under +# specific conditions: +# - Only one heartbeat diskless VDI is present in the pool. +# - The other hearbeat volumes must be diskful and limited to a maximum of 3. +USE_HTTP_NBD_SERVERS = True + +# Useful flag to trace calls using cProfile. +TRACE_PERFS = False + +# Enable/Disable VHD key hash support. +USE_KEY_HASH = False + +# ============================================================================== + +# TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', +# 'VDI_CONFIG_CBT', 'SR_PROBE' + +CAPABILITIES = [ + 'ATOMIC_PAUSE', + 'SR_UPDATE', + 'VDI_CREATE', + 'VDI_DELETE', + 'VDI_UPDATE', + 'VDI_ATTACH', + 'VDI_DETACH', + 'VDI_ACTIVATE', + 'VDI_DEACTIVATE', + 'VDI_CLONE', + 'VDI_MIRROR', + 'VDI_RESIZE', + 'VDI_SNAPSHOT', + 'VDI_GENERATE_CONFIG' +] + +CONFIGURATION = [ + ['group-name', 'LVM group name'], + ['redundancy', 'replication count'], + ['provisioning', '"thin" or "thick" are accepted (optional, defaults to thin)'], + ['monitor-db-quorum', 'disable controller when only one host is online (optional, defaults to true)'] +] + +DRIVER_INFO = { + 'name': 'LINSTOR resources on XCP-ng', + 'description': 'SR plugin which uses Linstor to manage VDIs', + 'vendor': 'Vates', + 'copyright': '(C) 2020 Vates', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {'ATTACH_FROM_CONFIG_WITH_TAPDISK': False} + +OPS_EXCLUSIVE = [ + 'sr_create', 'sr_delete', 'sr_attach', 'sr_detach', 'sr_scan', + 'sr_update', 'sr_probe', 'vdi_init', 'vdi_create', 'vdi_delete', + 'vdi_attach', 'vdi_detach', 'vdi_clone', 'vdi_snapshot', +] + +# ============================================================================== +# Misc helpers used by LinstorSR and linstor-thin plugin. +# ============================================================================== + + +def compute_volume_size(virtual_size, image_type): + if image_type == vhdutil.VDI_TYPE_VHD: + # All LINSTOR VDIs have the metadata area preallocated for + # the maximum possible virtual size (for fast online VDI.resize). 
+ meta_overhead = vhdutil.calcOverheadEmpty(LinstorVDI.MAX_SIZE) + bitmap_overhead = vhdutil.calcOverheadBitmap(virtual_size) + virtual_size += meta_overhead + bitmap_overhead + elif image_type != vhdutil.VDI_TYPE_RAW: + raise Exception('Invalid image type: {}'.format(image_type)) + + return LinstorVolumeManager.round_up_volume_size(virtual_size) + + +def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): + volume_metadata = linstor.get_volume_metadata(vdi_uuid) + image_type = volume_metadata.get(VDI_TYPE_TAG) + if image_type == vhdutil.VDI_TYPE_RAW: + return + + device_path = linstor.get_device_path(vdi_uuid) + + # If the virtual VHD size is lower than the LINSTOR volume size, + # there is nothing to do. + vhd_size = compute_volume_size( + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), # pylint: disable = E1120 + image_type + ) + + volume_info = linstor.get_volume_info(vdi_uuid) + volume_size = volume_info.virtual_size + + if vhd_size > volume_size: + inflate( + journaler, linstor, vdi_uuid, device_path, + vhd_size, volume_size + ) + + +def detach_thin_impl(session, linstor, sr_uuid, vdi_uuid): + volume_metadata = linstor.get_volume_metadata(vdi_uuid) + image_type = volume_metadata.get(VDI_TYPE_TAG) + if image_type == vhdutil.VDI_TYPE_RAW: + return + + def check_vbd_count(): + vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) + vbds = session.xenapi.VBD.get_all_records_where( + 'field "VDI" = "{}"'.format(vdi_ref) + ) + + num_plugged = 0 + for vbd_rec in vbds.values(): + if vbd_rec['currently_attached']: + num_plugged += 1 + if num_plugged > 1: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot deflate VDI {}, already used by ' + 'at least 2 VBDs'.format(vdi_uuid) + ) + + # We can have multiple VBDs attached to a VDI during a VM-template clone. + # So we use a timeout to ensure that we can detach the volume properly. + util.retry(check_vbd_count, maxretry=10, period=1) + + device_path = linstor.get_device_path(vdi_uuid) + new_volume_size = LinstorVolumeManager.round_up_volume_size( + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_phys(vdi_uuid) # pylint: disable = E1120 + ) + + volume_info = linstor.get_volume_info(vdi_uuid) + old_volume_size = volume_info.virtual_size + deflate( + linstor, vdi_uuid, device_path, new_volume_size, old_volume_size + ) + + +def detach_thin(session, linstor, sr_uuid, vdi_uuid): + # This function must always return without errors. + # Otherwise it could cause errors in the XAPI regarding the state of the VDI. + # It's why we use this `try` block. + try: + detach_thin_impl(session, linstor, sr_uuid, vdi_uuid) + except Exception as e: + util.SMlog('Failed to detach properly VDI {}: {}'.format(vdi_uuid, e)) + + +def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): + # Only inflate if the LINSTOR volume capacity is not enough. 
+ new_size = LinstorVolumeManager.round_up_volume_size(new_size) + if new_size <= old_size: + return + + util.SMlog( + 'Inflate {} (size={}, previous={})' + .format(vdi_uuid, new_size, old_size) + ) + + journaler.create( + LinstorJournaler.INFLATE, vdi_uuid, old_size + ) + linstor.resize_volume(vdi_uuid, new_size) + + result_size = linstor.get_volume_size(vdi_uuid) + if result_size < new_size: + util.SMlog( + 'WARNING: Cannot inflate volume to {}B, result size: {}B' + .format(new_size, result_size) + ) + + if not util.zeroOut( + vdi_path, result_size - vhdutil.VHD_FOOTER_SIZE, + vhdutil.VHD_FOOTER_SIZE + ): + raise xs_errors.XenError( + 'EIO', + opterr='Failed to zero out VHD footer {}'.format(vdi_path) + ) + + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, result_size, False) + journaler.remove(LinstorJournaler.INFLATE, vdi_uuid) + + +def deflate(linstor, vdi_uuid, vdi_path, new_size, old_size): + new_size = LinstorVolumeManager.round_up_volume_size(new_size) + if new_size >= old_size: + return + + util.SMlog( + 'Deflate {} (new size={}, previous={})' + .format(vdi_uuid, new_size, old_size) + ) + + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, new_size) + # TODO: Change the LINSTOR volume size using linstor.resize_volume. + + +IPS_XHA_CACHE = None + + +def get_ips_from_xha_config_file(): + if IPS_XHA_CACHE: + return IPS_XHA_CACHE + + ips = dict() + host_id = None + try: + # Ensure there is no dirty read problem. + # For example if the HA is reloaded. + tree = util.retry( + lambda: xml_parser.parse(XHA_CONFIG_PATH), + maxretry=10, + period=1 + ) + except: + return (None, ips) + + def parse_host_nodes(ips, node): + current_id = None + current_ip = None + + for sub_node in node: + if sub_node.tag == 'IPaddress': + current_ip = sub_node.text + elif sub_node.tag == 'HostID': + current_id = sub_node.text + else: + continue + + if current_id and current_ip: + ips[current_id] = current_ip + return + util.SMlog('Ill-formed XHA file, missing IPaddress or/and HostID') + + def parse_common_config(ips, node): + for sub_node in node: + if sub_node.tag == 'host': + parse_host_nodes(ips, sub_node) + + def parse_local_config(ips, node): + for sub_node in node: + if sub_node.tag == 'localhost': + for host_node in sub_node: + if host_node.tag == 'HostID': + return host_node.text + + for node in tree.getroot(): + if node.tag == 'common-config': + parse_common_config(ips, node) + elif node.tag == 'local-config': + host_id = parse_local_config(ips, node) + else: + continue + + if ips and host_id: + break + + return (host_id and ips.get(host_id), ips) + + +def activate_lvm_group(group_name): + path = group_name.split('/') + assert path and len(path) <= 2 + try: + lvutil.setActiveVG(path[0], True) + except Exception as e: + util.SMlog('Cannot active VG `{}`: {}'.format(path[0], e)) + +# ============================================================================== + +# Usage example: +# xe sr-create type=linstor name-label=linstor-sr +# host-uuid=d2deba7a-c5ad-4de1-9a20-5c8df3343e93 +# device-config:group-name=vg_loop device-config:redundancy=2 + + +class LinstorSR(SR.SR): + DRIVER_TYPE = 'linstor' + + PROVISIONING_TYPES = ['thin', 'thick'] + PROVISIONING_DEFAULT = 'thin' + + MANAGER_PLUGIN = 'linstor-manager' + + INIT_STATUS_NOT_SET = 0 + INIT_STATUS_IN_PROGRESS = 1 + INIT_STATUS_OK = 2 + INIT_STATUS_FAIL = 3 + + # -------------------------------------------------------------------------- + # SR methods. 
+ # -------------------------------------------------------------------------- + + @staticmethod + def handles(type): + return type == LinstorSR.DRIVER_TYPE + + def load(self, sr_uuid): + if not LINSTOR_AVAILABLE: + raise util.SMException( + 'Can\'t load LinstorSR: LINSTOR libraries are missing' + ) + + # Check parameters. + if 'group-name' not in self.dconf or not self.dconf['group-name']: + raise xs_errors.XenError('LinstorConfigGroupNameMissing') + if 'redundancy' not in self.dconf or not self.dconf['redundancy']: + raise xs_errors.XenError('LinstorConfigRedundancyMissing') + + self.driver_config = DRIVER_CONFIG + + # Check provisioning config. + provisioning = self.dconf.get('provisioning') + if provisioning: + if provisioning in self.PROVISIONING_TYPES: + self._provisioning = provisioning + else: + raise xs_errors.XenError( + 'InvalidArg', + opterr='Provisioning parameter must be one of {}'.format( + self.PROVISIONING_TYPES + ) + ) + else: + self._provisioning = self.PROVISIONING_DEFAULT + + monitor_db_quorum = self.dconf.get('monitor-db-quorum') + self._monitor_db_quorum = (monitor_db_quorum is None) or \ + distutils.util.strtobool(monitor_db_quorum) + + # Note: We don't have access to the session field if the + # 'vdi_attach_from_config' command is executed. + self._has_session = self.sr_ref and self.session is not None + if self._has_session: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + + provisioning = self.sm_config.get('provisioning') + if provisioning in self.PROVISIONING_TYPES: + self._provisioning = provisioning + + # Define properties for SR parent class. + self.ops_exclusive = OPS_EXCLUSIVE + self.path = LinstorVolumeManager.DEV_ROOT_PATH + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + + if self.cmd == 'sr_create': + self._redundancy = int(self.dconf['redundancy']) or 1 + self._linstor = None # Ensure that LINSTOR attribute exists. + self._journaler = None + + self._is_master = False + if 'SRmaster' in self.dconf and self.dconf['SRmaster'] == 'true': + self._is_master = True + self._group_name = self.dconf['group-name'] + + self._vdi_shared_time = 0 + + self._init_status = self.INIT_STATUS_NOT_SET + + self._vdis_loaded = False + self._all_volume_info_cache = None + self._all_volume_metadata_cache = None + + def _locked_load(method): + def wrapped_method(self, *args, **kwargs): + self._init_status = self.INIT_STATUS_OK + return method(self, *args, **kwargs) + + def load(self, *args, **kwargs): + # Activate all LVMs to make drbd-reactor happy. + if self.srcmd.cmd == 'sr_attach': + activate_lvm_group(self._group_name) + + if not self._has_session: + if self.srcmd.cmd in ( + 'vdi_attach_from_config', + 'vdi_detach_from_config', + # When on-slave (is_open) is executed we have an + # empty command. + None + ): + def create_linstor(uri, attempt_count=30): + self._linstor = LinstorVolumeManager( + uri, + self._group_name, + logger=util.SMlog, + attempt_count=attempt_count + ) + # Only required if we are attaching from config using a non-special VDI. + # I.e. not an HA volume. + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + + controller_uri = get_controller_uri() + if controller_uri: + create_linstor(controller_uri) + else: + def connect(): + # We must have a valid LINSTOR instance here without using + # the XAPI. Fallback with the HA config file. 
+ for ip in get_ips_from_xha_config_file()[1].values(): + controller_uri = 'linstor://' + ip + try: + util.SMlog('Connecting from config to LINSTOR controller using: {}'.format(ip)) + create_linstor(controller_uri, attempt_count=0) + return controller_uri + except: + pass + + controller_uri = util.retry(connect, maxretry=30, period=1) + if not controller_uri: + raise xs_errors.XenError( + 'SRUnavailable', + opterr='No valid controller URI to attach/detach from config' + ) + + self._journaler = LinstorJournaler( + controller_uri, self._group_name, logger=util.SMlog + ) + + if self.srcmd.cmd is None: + # Only useful on on-slave plugin (is_open). + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + + return wrapped_method(self, *args, **kwargs) + + if not self._is_master: + if self.cmd in [ + 'sr_create', 'sr_delete', 'sr_update', 'sr_probe', + 'sr_scan', 'vdi_create', 'vdi_delete', 'vdi_resize', + 'vdi_snapshot', 'vdi_clone' + ]: + util.SMlog('{} blocked for non-master'.format(self.cmd)) + raise xs_errors.XenError('LinstorMaster') + + # Because the LINSTOR KV objects cache all values, we must lock + # the VDI before the LinstorJournaler/LinstorVolumeManager + # instantiation and before any action on the master to avoid a + # bad read. The lock is also necessary to avoid strange + # behaviors if the GC is executed during an action on a slave. + if self.cmd.startswith('vdi_'): + self._shared_lock_vdi(self.srcmd.params['vdi_uuid']) + self._vdi_shared_time = time.time() + + if self.srcmd.cmd != 'sr_create' and self.srcmd.cmd != 'sr_detach': + try: + self._reconnect() + except Exception as e: + raise xs_errors.XenError('SRUnavailable', opterr=str(e)) + + if self._linstor: + try: + hosts = self._linstor.disconnected_hosts + except Exception as e: + raise xs_errors.XenError('SRUnavailable', opterr=str(e)) + + if hosts: + util.SMlog('Failed to join node(s): {}'.format(hosts)) + + # Ensure we use a non-locked volume when vhdutil is called. + if ( + self._is_master and self.cmd.startswith('vdi_') and + self.cmd != 'vdi_create' + ): + self._linstor.ensure_volume_is_not_locked( + self.srcmd.params['vdi_uuid'] + ) + + try: + # If the command is a SR scan command on the master, + # we must load all VDIs and clean journal transactions. + # We must load the VDIs in the snapshot case too only if + # there is at least one entry in the journal. + # + # If the command is a SR command we want at least to remove + # resourceless volumes. + if self._is_master and self.cmd not in [ + 'vdi_attach', 'vdi_detach', + 'vdi_activate', 'vdi_deactivate', + 'vdi_epoch_begin', 'vdi_epoch_end', + 'vdi_update', 'vdi_destroy' + ]: + load_vdis = ( + self.cmd == 'sr_scan' or + self.cmd == 'sr_attach' + ) or len( + self._journaler.get_all(LinstorJournaler.INFLATE) + ) or len( + self._journaler.get_all(LinstorJournaler.CLONE) + ) + + if load_vdis: + self._load_vdis() + + self._linstor.remove_resourceless_volumes() + + self._synchronize_metadata() + except Exception as e: + if self.cmd == 'sr_scan' or self.cmd == 'sr_attach': + # Always raise, we don't want to remove VDIs + # from the XAPI database otherwise. 
+ raise e + util.SMlog( + 'Ignoring exception in LinstorSR.load: {}'.format(e) + ) + util.SMlog(traceback.format_exc()) + + return wrapped_method(self, *args, **kwargs) + + @functools.wraps(wrapped_method) + def wrap(self, *args, **kwargs): + if self._init_status in \ + (self.INIT_STATUS_OK, self.INIT_STATUS_IN_PROGRESS): + return wrapped_method(self, *args, **kwargs) + if self._init_status == self.INIT_STATUS_FAIL: + util.SMlog( + 'Can\'t call method {} because initialization failed' + .format(method) + ) + else: + try: + self._init_status = self.INIT_STATUS_IN_PROGRESS + return load(self, *args, **kwargs) + except Exception: + if self._init_status != self.INIT_STATUS_OK: + self._init_status = self.INIT_STATUS_FAIL + raise + + return wrap + + def cleanup(self): + if self._vdi_shared_time: + self._shared_lock_vdi(self.srcmd.params['vdi_uuid'], locked=False) + + @_locked_load + def create(self, uuid, size): + util.SMlog('LinstorSR.create for {}'.format(self.uuid)) + + host_adresses = util.get_host_addresses(self.session) + if self._redundancy > len(host_adresses): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Redundancy greater than host count' + ) + + xenapi = self.session.xenapi + srs = xenapi.SR.get_all_records_where( + 'field "type" = "{}"'.format(self.DRIVER_TYPE) + ) + srs = dict(filter(lambda e: e[1]['uuid'] != self.uuid, srs.items())) + + for sr in srs.values(): + for pbd in sr['PBDs']: + device_config = xenapi.PBD.get_device_config(pbd) + group_name = device_config.get('group-name') + if group_name and group_name == self._group_name: + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='group name must be unique' + ) + + if srs: + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='LINSTOR SR must be unique in a pool' + ) + + online_hosts = util.get_online_hosts(self.session) + if len(online_hosts) < len(host_adresses): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Not enough online hosts' + ) + + ips = {} + for host_ref in online_hosts: + record = self.session.xenapi.host.get_record(host_ref) + hostname = record['hostname'] + ips[hostname] = record['address'] + + if len(ips) != len(online_hosts): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Multiple hosts with same hostname' + ) + + # Ensure ports are opened and LINSTOR satellites + # are activated. In the same time the drbd-reactor instances + # must be stopped. + self._prepare_sr_on_all_hosts(self._group_name, enabled=True) + + # Create SR. + # Throw if the SR already exists. + try: + self._linstor = LinstorVolumeManager.create_sr( + self._group_name, + ips, + self._redundancy, + thin_provisioning=self._provisioning == 'thin', + auto_quorum=self._monitor_db_quorum, + logger=util.SMlog + ) + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + except Exception as e: + util.SMlog('Failed to create LINSTOR SR: {}'.format(e)) + raise xs_errors.XenError('LinstorSRCreate', opterr=str(e)) + + try: + util.SMlog( + "Finishing SR creation, enable drbd-reactor on all hosts..." 
+ ) + self._update_drbd_reactor_on_all_hosts(enabled=True) + except Exception as e: + try: + self._linstor.destroy() + except Exception as e2: + util.SMlog( + 'Failed to destroy LINSTOR SR after creation fail: {}' + .format(e2) + ) + raise e + + @_locked_load + def delete(self, uuid): + util.SMlog('LinstorSR.delete for {}'.format(self.uuid)) + cleanup.gc_force(self.session, self.uuid) + + if self.vdis or self._linstor._volumes: + raise xs_errors.XenError('SRNotEmpty') + + node_name = get_controller_node_name() + if not node_name: + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr='Cannot get controller node name' + ) + + host = None + if node_name == 'localhost': + host = util.get_this_host_ref(self.session) + else: + for slave in util.get_all_slaves(self.session): + r_name = self.session.xenapi.host.get_record(slave)['hostname'] + if r_name == node_name: + host = slave + break + + if not host: + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr='Failed to find host with hostname: {}'.format( + node_name + ) + ) + + try: + self._update_drbd_reactor_on_all_hosts( + controller_node_name=node_name, enabled=False + ) + + args = { + 'groupName': self._group_name, + } + self._exec_manager_command( + host, 'destroy', args, 'LinstorSRDelete' + ) + except Exception as e: + try: + self._update_drbd_reactor_on_all_hosts( + controller_node_name=node_name, enabled=True + ) + except Exception as e2: + util.SMlog( + 'Failed to restart drbd-reactor after destroy fail: {}' + .format(e2) + ) + util.SMlog('Failed to delete LINSTOR SR: {}'.format(e)) + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr=str(e) + ) + + Lock.cleanupAll(self.uuid) + + @_locked_load + def update(self, uuid): + util.SMlog('LinstorSR.update for {}'.format(self.uuid)) + + # Well, how can we update a SR if it doesn't exist? :thinking: + if not self._linstor: + raise xs_errors.XenError( + 'SRUnavailable', + opterr='no such volume group: {}'.format(self._group_name) + ) + + self._update_stats(0) + + # Update the SR name and description only in LINSTOR metadata. + xenapi = self.session.xenapi + self._linstor.metadata = { + NAME_LABEL_TAG: util.to_plain_string( + xenapi.SR.get_name_label(self.sr_ref) + ), + NAME_DESCRIPTION_TAG: util.to_plain_string( + xenapi.SR.get_name_description(self.sr_ref) + ) + } + + @_locked_load + def attach(self, uuid): + util.SMlog('LinstorSR.attach for {}'.format(self.uuid)) + + if not self._linstor: + raise xs_errors.XenError( + 'SRUnavailable', + opterr='no such group: {}'.format(self._group_name) + ) + + @_locked_load + def detach(self, uuid): + util.SMlog('LinstorSR.detach for {}'.format(self.uuid)) + cleanup.abort(self.uuid) + + @_locked_load + def probe(self): + util.SMlog('LinstorSR.probe for {}'.format(self.uuid)) + # TODO + + @_locked_load + def scan(self, uuid): + if self._init_status == self.INIT_STATUS_FAIL: + return + + util.SMlog('LinstorSR.scan for {}'.format(self.uuid)) + if not self._linstor: + raise xs_errors.XenError( + 'SRUnavailable', + opterr='no such volume group: {}'.format(self._group_name) + ) + + # Note: `scan` can be called outside this module, so ensure the VDIs + # are loaded. + self._load_vdis() + self._update_physical_size() + + for vdi_uuid in self.vdis.keys(): + if self.vdis[vdi_uuid].deleted: + del self.vdis[vdi_uuid] + + # Security to prevent VDIs from being forgotten if the controller + # is started without a shared and mounted /var/lib/linstor path. 
+ try:
+ self._linstor.get_database_path()
+ except Exception:
+ # Failed to get the database path, so ensure we don't have
+ # VDIs in the XAPI database...
+ if self.session.xenapi.SR.get_VDIs(
+ self.session.xenapi.SR.get_by_uuid(self.uuid)
+ ):
+ raise xs_errors.XenError(
+ 'SRUnavailable',
+ opterr='Database is not mounted'
+ )
+
+ # Update the database before restarting the GC to avoid a bad sync
+ # if new VDIs have been introduced.
+ ret = super(LinstorSR, self).scan(self.uuid)
+ self._kick_gc()
+ return ret
+
+ @_locked_load
+ def vdi(self, uuid):
+ return LinstorVDI(self, uuid)
+
+ _locked_load = staticmethod(_locked_load)
+
+ # --------------------------------------------------------------------------
+ # Lock.
+ # --------------------------------------------------------------------------
+
+ def _shared_lock_vdi(self, vdi_uuid, locked=True):
+ master = util.get_master_ref(self.session)
+
+ command = 'lockVdi'
+ args = {
+ 'groupName': self._group_name,
+ 'srUuid': self.uuid,
+ 'vdiUuid': vdi_uuid,
+ 'locked': str(locked)
+ }
+
+ # Note: We must not unlock the volume once the timeout has been
+ # reached, because the SR lock is not held during volume unlock.
+ # Otherwise we could destroy a valid lock acquired from another host...
+ #
+ # This code is not very clean; the ideal solution would be to acquire
+ # the SR lock during volume unlock (as is done for lock), but that is
+ # not easy to implement without impacting performance.
+ if not locked:
+ elapsed_time = time.time() - self._vdi_shared_time
+ timeout = LinstorVolumeManager.LOCKED_EXPIRATION_DELAY * 0.7
+ if elapsed_time >= timeout:
+ util.SMlog(
+ 'Avoid unlock call of {} because timeout has been reached'
+ .format(vdi_uuid)
+ )
+ return
+
+ self._exec_manager_command(master, command, args, 'VDIUnavailable')
+
+ # --------------------------------------------------------------------------
+ # Network.
+ # -------------------------------------------------------------------------- + + def _exec_manager_command(self, host_ref, command, args, error): + host_rec = self.session.xenapi.host.get_record(host_ref) + host_uuid = host_rec['uuid'] + + try: + ret = self.session.xenapi.host.call_plugin( + host_ref, self.MANAGER_PLUGIN, command, args + ) + except Exception as e: + util.SMlog( + 'call-plugin on {} ({}:{} with {}) raised'.format( + host_uuid, self.MANAGER_PLUGIN, command, args + ) + ) + raise e + + util.SMlog( + 'call-plugin on {} ({}:{} with {}) returned: {}'.format( + host_uuid, self.MANAGER_PLUGIN, command, args, ret + ) + ) + if ret == 'False': + raise xs_errors.XenError( + error, + opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN) + ) + + def _prepare_sr(self, host, group_name, enabled): + self._exec_manager_command( + host, + 'prepareSr' if enabled else 'releaseSr', + {'groupName': group_name}, + 'SRUnavailable' + ) + + def _prepare_sr_on_all_hosts(self, group_name, enabled): + master = util.get_master_ref(self.session) + self._prepare_sr(master, group_name, enabled) + + for slave in util.get_all_slaves(self.session): + self._prepare_sr(slave, group_name, enabled) + + def _update_drbd_reactor(self, host, enabled): + self._exec_manager_command( + host, + 'updateDrbdReactor', + {'enabled': str(enabled)}, + 'SRUnavailable' + ) + + def _update_drbd_reactor_on_all_hosts( + self, enabled, controller_node_name=None + ): + if controller_node_name == 'localhost': + controller_node_name = self.session.xenapi.host.get_record( + util.get_this_host_ref(self.session) + )['hostname'] + assert controller_node_name + assert controller_node_name != 'localhost' + + controller_host = None + secondary_hosts = [] + + hosts = self.session.xenapi.host.get_all_records() + for host_ref, host_rec in hosts.iteritems(): + hostname = host_rec['hostname'] + if controller_node_name == hostname: + controller_host = host_ref + else: + secondary_hosts.append((host_ref, hostname)) + + action_name = 'Starting' if enabled else 'Stopping' + if controller_node_name and not controller_host: + util.SMlog('Failed to find controller host: `{}`'.format( + controller_node_name + )) + + if enabled and controller_host: + util.SMlog('{} drbd-reactor on controller host `{}`...'.format( + action_name, controller_node_name + )) + # If enabled is true, we try to start the controller on the desired + # node name first. + self._update_drbd_reactor(controller_host, enabled) + + for host_ref, hostname in secondary_hosts: + util.SMlog('{} drbd-reactor on host {}...'.format( + action_name, hostname + )) + self._update_drbd_reactor(host_ref, enabled) + + if not enabled and controller_host: + util.SMlog('{} drbd-reactor on controller host `{}`...'.format( + action_name, controller_node_name + )) + # If enabled is false, we disable the drbd-reactor service of + # the controller host last. Why? Otherwise the linstor-controller + # of other nodes can be started, and we don't want that. + self._update_drbd_reactor(controller_host, enabled) + + # -------------------------------------------------------------------------- + # Metadata. + # -------------------------------------------------------------------------- + + def _synchronize_metadata_and_xapi(self): + try: + # First synch SR parameters. + self.update(self.uuid) + + # Now update the VDI information in the metadata if required. 
+ xenapi = self.session.xenapi + volumes_metadata = self._linstor.get_volumes_with_metadata() + for vdi_uuid, volume_metadata in volumes_metadata.items(): + try: + vdi_ref = xenapi.VDI.get_by_uuid(vdi_uuid) + except Exception: + # May be the VDI is not in XAPI yet dont bother. + continue + + label = util.to_plain_string( + xenapi.VDI.get_name_label(vdi_ref) + ) + description = util.to_plain_string( + xenapi.VDI.get_name_description(vdi_ref) + ) + + if ( + volume_metadata.get(NAME_LABEL_TAG) != label or + volume_metadata.get(NAME_DESCRIPTION_TAG) != description + ): + self._linstor.update_volume_metadata(vdi_uuid, { + NAME_LABEL_TAG: label, + NAME_DESCRIPTION_TAG: description + }) + except Exception as e: + raise xs_errors.XenError( + 'MetadataError', + opterr='Error synching SR Metadata and XAPI: {}'.format(e) + ) + + def _synchronize_metadata(self): + if not self._is_master: + return + + util.SMlog('Synchronize metadata...') + if self.cmd == 'sr_attach': + try: + util.SMlog( + 'Synchronize SR metadata and the state on the storage.' + ) + self._synchronize_metadata_and_xapi() + except Exception as e: + util.SMlog('Failed to synchronize metadata: {}'.format(e)) + + # -------------------------------------------------------------------------- + # Stats. + # -------------------------------------------------------------------------- + + def _update_stats(self, virt_alloc_delta): + valloc = int(self.session.xenapi.SR.get_virtual_allocation( + self.sr_ref + )) + + # Update size attributes of the SR parent class. + self.virtual_allocation = valloc + virt_alloc_delta + + self._update_physical_size() + + # Notify SR parent class. + self._db_update() + + def _update_physical_size(self): + # We use the size of the smallest disk, this is an approximation that + # ensures the displayed physical size is reachable by the user. + (min_physical_size, pool_count) = self._linstor.get_min_physical_size() + self.physical_size = min_physical_size * pool_count / \ + self._linstor.redundancy + + self.physical_utilisation = self._linstor.allocated_volume_size + + # -------------------------------------------------------------------------- + # VDIs. + # -------------------------------------------------------------------------- + + def _load_vdis(self): + if self._vdis_loaded: + return + + assert self._is_master + + # We use a cache to avoid repeated JSON parsing. + # The performance gain is not big but we can still + # enjoy it with a few lines. + self._create_linstor_cache() + self._load_vdis_ex() + self._destroy_linstor_cache() + + # We must mark VDIs as loaded only if the load is a success. + self._vdis_loaded = True + + self._undo_all_journal_transactions() + + def _load_vdis_ex(self): + # 1. Get existing VDIs in XAPI. + xenapi = self.session.xenapi + xapi_vdi_uuids = set() + for vdi in xenapi.SR.get_VDIs(self.sr_ref): + xapi_vdi_uuids.add(xenapi.VDI.get_uuid(vdi)) + + # 2. Get volumes info. + all_volume_info = self._all_volume_info_cache + volumes_metadata = self._all_volume_metadata_cache + + # 3. Get CBT vdis. + # See: https://support.citrix.com/article/CTX230619 + cbt_vdis = set() + for volume_metadata in volumes_metadata.values(): + cbt_uuid = volume_metadata.get(CBTLOG_TAG) + if cbt_uuid: + cbt_vdis.add(cbt_uuid) + + introduce = False + + # Try to introduce VDIs only during scan/attach. 
+ if self.cmd == 'sr_scan' or self.cmd == 'sr_attach': + has_clone_entries = list(self._journaler.get_all( + LinstorJournaler.CLONE + ).items()) + + if has_clone_entries: + util.SMlog( + 'Cannot introduce VDIs during scan because it exists ' + 'CLONE entries in journaler on SR {}'.format(self.uuid) + ) + else: + introduce = True + + # 4. Now check all volume info. + vdi_to_snaps = {} + for vdi_uuid, volume_info in all_volume_info.items(): + if vdi_uuid.startswith(cleanup.SR.TMP_RENAME_PREFIX): + continue + + # 4.a. Check if the VDI in LINSTOR is in XAPI VDIs. + if vdi_uuid not in xapi_vdi_uuids: + if not introduce: + continue + + if vdi_uuid.startswith('DELETED_'): + continue + + volume_metadata = volumes_metadata.get(vdi_uuid) + if not volume_metadata: + util.SMlog( + 'Skipping volume {} because no metadata could be found' + .format(vdi_uuid) + ) + continue + + util.SMlog( + 'Trying to introduce VDI {} as it is present in ' + 'LINSTOR and not in XAPI...' + .format(vdi_uuid) + ) + + try: + self._linstor.get_device_path(vdi_uuid) + except Exception as e: + util.SMlog( + 'Cannot introduce {}, unable to get path: {}' + .format(vdi_uuid, e) + ) + continue + + name_label = volume_metadata.get(NAME_LABEL_TAG) or '' + type = volume_metadata.get(TYPE_TAG) or 'user' + vdi_type = volume_metadata.get(VDI_TYPE_TAG) + + if not vdi_type: + util.SMlog( + 'Cannot introduce {} '.format(vdi_uuid) + + 'without vdi_type' + ) + continue + + sm_config = { + 'vdi_type': vdi_type + } + + if vdi_type == vhdutil.VDI_TYPE_RAW: + managed = not volume_metadata.get(HIDDEN_TAG) + elif vdi_type == vhdutil.VDI_TYPE_VHD: + vhd_info = self._vhdutil.get_vhd_info(vdi_uuid) + managed = not vhd_info.hidden + if vhd_info.parentUuid: + sm_config['vhd-parent'] = vhd_info.parentUuid + else: + util.SMlog( + 'Cannot introduce {} with invalid VDI type {}' + .format(vdi_uuid, vdi_type) + ) + continue + + util.SMlog( + 'Introducing VDI {} '.format(vdi_uuid) + + ' (name={}, virtual_size={}, allocated_size={})'.format( + name_label, + volume_info.virtual_size, + volume_info.allocated_size + ) + ) + + vdi_ref = xenapi.VDI.db_introduce( + vdi_uuid, + name_label, + volume_metadata.get(NAME_DESCRIPTION_TAG) or '', + self.sr_ref, + type, + False, # sharable + bool(volume_metadata.get(READ_ONLY_TAG)), + {}, # other_config + vdi_uuid, # location + {}, # xenstore_data + sm_config, + managed, + str(volume_info.virtual_size), + str(volume_info.allocated_size) + ) + + is_a_snapshot = volume_metadata.get(IS_A_SNAPSHOT_TAG) + xenapi.VDI.set_is_a_snapshot(vdi_ref, bool(is_a_snapshot)) + if is_a_snapshot: + xenapi.VDI.set_snapshot_time( + vdi_ref, + xmlrpclib.DateTime( + volume_metadata[SNAPSHOT_TIME_TAG] or + '19700101T00:00:00Z' + ) + ) + + snap_uuid = volume_metadata[SNAPSHOT_OF_TAG] + if snap_uuid in vdi_to_snaps: + vdi_to_snaps[snap_uuid].append(vdi_uuid) + else: + vdi_to_snaps[snap_uuid] = [vdi_uuid] + + # 4.b. Add the VDI in the list. + vdi = self.vdi(vdi_uuid) + self.vdis[vdi_uuid] = vdi + + if USE_KEY_HASH and vdi.vdi_type == vhdutil.VDI_TYPE_VHD: + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + vdi.sm_config_override['key_hash'] = \ + self._vhdutil.get_key_hash(vdi_uuid) # pylint: disable = E1120 + + # 4.c. Update CBT status of disks either just added + # or already in XAPI. + cbt_uuid = volume_metadata.get(CBTLOG_TAG) + if cbt_uuid in cbt_vdis: + vdi_ref = xenapi.VDI.get_by_uuid(vdi_uuid) + xenapi.VDI.set_cbt_enabled(vdi_ref, True) + # For existing VDIs, update local state too. 
+ # Scan in the base class SR updates existing VDIs
+ # again based on local states.
+ self.vdis[vdi_uuid].cbt_enabled = True
+ cbt_vdis.remove(cbt_uuid)
+
+ # 5. Now set the snapshot statuses correctly in XAPI.
+ for src_uuid in vdi_to_snaps:
+ try:
+ src_ref = xenapi.VDI.get_by_uuid(src_uuid)
+ except Exception:
+ # The source VDI no longer exists, continue.
+ continue
+
+ for snap_uuid in vdi_to_snaps[src_uuid]:
+ try:
+ # This might fail in cases where it's already set.
+ snap_ref = xenapi.VDI.get_by_uuid(snap_uuid)
+ xenapi.VDI.set_snapshot_of(snap_ref, src_ref)
+ except Exception as e:
+ util.SMlog('Setting snapshot failed: {}'.format(e))
+
+ # TODO: Check correctly how to use CBT.
+ # Update cbt_enabled on the right VDI, check LVM/FileSR code.
+
+ # 6. If we have items remaining in this list,
+ # they are cbt_metadata VDIs that XAPI doesn't know about.
+ # Add them to self.vdis and they'll get added to the DB.
+ for cbt_uuid in cbt_vdis:
+ new_vdi = self.vdi(cbt_uuid)
+ new_vdi.ty = 'cbt_metadata'
+ new_vdi.cbt_enabled = True
+ self.vdis[cbt_uuid] = new_vdi
+
+ # 7. Update virtual allocation, build genealogy and remove useless VDIs.
+ self.virtual_allocation = 0
+
+ # 8. Build genealogy.
+ genealogy = {}
+
+ for vdi_uuid, vdi in self.vdis.items():
+ if vdi.parent:
+ if vdi.parent in self.vdis:
+ self.vdis[vdi.parent].read_only = True
+ if vdi.parent in genealogy:
+ genealogy[vdi.parent].append(vdi_uuid)
+ else:
+ genealogy[vdi.parent] = [vdi_uuid]
+ if not vdi.hidden:
+ self.virtual_allocation += vdi.size
+
+ # 9. Remove all hidden leaf nodes to avoid introducing records that
+ # will be GC'ed.
+ for vdi_uuid in self.vdis.keys():
+ if vdi_uuid not in genealogy and self.vdis[vdi_uuid].hidden:
+ util.SMlog(
+ 'Scan found hidden leaf ({}), ignoring'.format(vdi_uuid)
+ )
+ del self.vdis[vdi_uuid]
+
+ # --------------------------------------------------------------------------
+ # Journals.
+ # --------------------------------------------------------------------------
+
+ def _get_vdi_path_and_parent(self, vdi_uuid, volume_name):
+ try:
+ device_path = self._linstor.build_device_path(volume_name)
+ if not util.pathexists(device_path):
+ return (None, None)
+
+ # If it's a RAW VDI, there is no parent.
+ volume_metadata = self._linstor.get_volume_metadata(vdi_uuid)
+ vdi_type = volume_metadata[VDI_TYPE_TAG]
+ if vdi_type == vhdutil.VDI_TYPE_RAW:
+ return (device_path, None)
+
+ # Otherwise it's a VHD and a parent can exist.
+ if not self._vhdutil.check(vdi_uuid): + return (None, None) + + vhd_info = self._vhdutil.get_vhd_info(vdi_uuid) + if vhd_info: + return (device_path, vhd_info.parentUuid) + except Exception as e: + util.SMlog( + 'Failed to get VDI path and parent, ignoring: {}' + .format(e) + ) + return (None, None) + + def _undo_all_journal_transactions(self): + util.SMlog('Undoing all journal transactions...') + self.lock.acquire() + try: + self._handle_interrupted_inflate_ops() + self._handle_interrupted_clone_ops() + pass + finally: + self.lock.release() + + def _handle_interrupted_inflate_ops(self): + transactions = self._journaler.get_all(LinstorJournaler.INFLATE) + for vdi_uuid, old_size in transactions.items(): + self._handle_interrupted_inflate(vdi_uuid, old_size) + self._journaler.remove(LinstorJournaler.INFLATE, vdi_uuid) + + def _handle_interrupted_clone_ops(self): + transactions = self._journaler.get_all(LinstorJournaler.CLONE) + for vdi_uuid, old_size in transactions.items(): + self._handle_interrupted_clone(vdi_uuid, old_size) + self._journaler.remove(LinstorJournaler.CLONE, vdi_uuid) + + def _handle_interrupted_inflate(self, vdi_uuid, old_size): + util.SMlog( + '*** INTERRUPTED INFLATE OP: for {} ({})' + .format(vdi_uuid, old_size) + ) + + vdi = self.vdis.get(vdi_uuid) + if not vdi: + util.SMlog('Cannot deflate missing VDI {}'.format(vdi_uuid)) + return + + assert not self._all_volume_info_cache + volume_info = self._linstor.get_volume_info(vdi_uuid) + + current_size = volume_info.virtual_size + assert current_size > 0 + + util.zeroOut( + vdi.path, + current_size - vhdutil.VHD_FOOTER_SIZE, + vhdutil.VHD_FOOTER_SIZE + ) + deflate(self._linstor, vdi_uuid, vdi.path, old_size, current_size) + + def _handle_interrupted_clone( + self, vdi_uuid, clone_info, force_undo=False + ): + util.SMlog( + '*** INTERRUPTED CLONE OP: for {} ({})' + .format(vdi_uuid, clone_info) + ) + + base_uuid, snap_uuid = clone_info.split('_') + + # Use LINSTOR data because new VDIs may not be in the XAPI. + volume_names = self._linstor.get_volumes_with_name() + + # Check if we don't have a base VDI. (If clone failed at startup.) + if base_uuid not in volume_names: + if vdi_uuid in volume_names: + util.SMlog('*** INTERRUPTED CLONE OP: nothing to do') + return + raise util.SMException( + 'Base copy {} not present, but no original {} found' + .format(base_uuid, vdi_uuid) + ) + + if force_undo: + util.SMlog('Explicit revert') + self._undo_clone( + volume_names, vdi_uuid, base_uuid, snap_uuid + ) + return + + # If VDI or snap uuid is missing... 
+ if vdi_uuid not in volume_names or \ + (snap_uuid and snap_uuid not in volume_names): + util.SMlog('One or both leaves missing => revert') + self._undo_clone(volume_names, vdi_uuid, base_uuid, snap_uuid) + return + + vdi_path, vdi_parent_uuid = self._get_vdi_path_and_parent( + vdi_uuid, volume_names[vdi_uuid] + ) + snap_path, snap_parent_uuid = self._get_vdi_path_and_parent( + snap_uuid, volume_names[snap_uuid] + ) + + if not vdi_path or (snap_uuid and not snap_path): + util.SMlog('One or both leaves invalid (and path(s)) => revert') + self._undo_clone(volume_names, vdi_uuid, base_uuid, snap_uuid) + return + + util.SMlog('Leaves valid but => revert') + self._undo_clone(volume_names, vdi_uuid, base_uuid, snap_uuid) + + def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): + base_path = self._linstor.build_device_path(volume_names[base_uuid]) + base_metadata = self._linstor.get_volume_metadata(base_uuid) + base_type = base_metadata[VDI_TYPE_TAG] + + if not util.pathexists(base_path): + util.SMlog('Base not found! Exit...') + util.SMlog('*** INTERRUPTED CLONE OP: rollback fail') + return + + # Un-hide the parent. + self._linstor.update_volume_metadata(base_uuid, {READ_ONLY_TAG: False}) + if base_type == vhdutil.VDI_TYPE_VHD: + vhd_info = self._vhdutil.get_vhd_info(base_uuid, False) + if vhd_info.hidden: + self._vhdutil.set_hidden(base_path, False) + elif base_type == vhdutil.VDI_TYPE_RAW and \ + base_metadata.get(HIDDEN_TAG): + self._linstor.update_volume_metadata( + base_uuid, {HIDDEN_TAG: False} + ) + + # Remove the child nodes. + if snap_uuid and snap_uuid in volume_names: + util.SMlog('Destroying snap {}...'.format(snap_uuid)) + + try: + self._linstor.destroy_volume(snap_uuid) + except Exception as e: + util.SMlog( + 'Cannot destroy snap {} during undo clone: {}' + .format(snap_uuid, e) + ) + + if vdi_uuid in volume_names: + try: + util.SMlog('Destroying {}...'.format(vdi_uuid)) + self._linstor.destroy_volume(vdi_uuid) + except Exception as e: + util.SMlog( + 'Cannot destroy VDI {} during undo clone: {}' + .format(vdi_uuid, e) + ) + # We can get an exception like this: + # "Shutdown of the DRBD resource 'XXX failed", so the + # volume info remains... The problem is we can't rename + # properly the base VDI below this line, so we must change the + # UUID of this bad VDI before. + self._linstor.update_volume_uuid( + vdi_uuid, 'DELETED_' + vdi_uuid, force=True + ) + + # Rename! + self._linstor.update_volume_uuid(base_uuid, vdi_uuid) + + # Inflate to the right size. + if base_type == vhdutil.VDI_TYPE_VHD: + vdi = self.vdi(vdi_uuid) + volume_size = compute_volume_size(vdi.size, vdi.vdi_type) + inflate( + self._journaler, self._linstor, vdi_uuid, vdi.path, + volume_size, vdi.capacity + ) + self.vdis[vdi_uuid] = vdi + + # At this stage, tapdisk and SM vdi will be in paused state. Remove + # flag to facilitate vm deactivate. + vdi_ref = self.session.xenapi.VDI.get_by_uuid(vdi_uuid) + self.session.xenapi.VDI.remove_from_sm_config(vdi_ref, 'paused') + + util.SMlog('*** INTERRUPTED CLONE OP: rollback success') + + # -------------------------------------------------------------------------- + # Cache. + # -------------------------------------------------------------------------- + + def _create_linstor_cache(self): + # TODO: use a nonlocal with python3. 
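+ # For reference, a minimal sketch of what the TODO above means on
+ # Python 3: the `context` class below only exists to emulate a
+ # writable closure variable, which `nonlocal` provides directly.
+ #
+ #     reconnect = False
+ #
+ #     def create_cache():
+ #         nonlocal reconnect
+ #         try:
+ #             if reconnect:
+ #                 self._reconnect()
+ #             return self._linstor.get_volumes_with_info()
+ #         except Exception:
+ #             reconnect = True
+ #             raise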
+ class context: + reconnect = False + + def create_cache(): + try: + if context.reconnect: + self._reconnect() + return self._linstor.get_volumes_with_info() + except Exception as e: + context.reconnect = True + raise e + + self._all_volume_metadata_cache = \ + self._linstor.get_volumes_with_metadata() + self._all_volume_info_cache = util.retry( + create_cache, + maxretry=10, + period=3 + ) + + def _destroy_linstor_cache(self): + self._all_volume_info_cache = None + self._all_volume_metadata_cache = None + + # -------------------------------------------------------------------------- + # Misc. + # -------------------------------------------------------------------------- + + def _reconnect(self): + controller_uri = get_controller_uri() + + self._journaler = LinstorJournaler( + controller_uri, self._group_name, logger=util.SMlog + ) + + # Try to open SR if exists. + # We can repair only if we are on the master AND if + # we are trying to execute an exclusive operation. + # Otherwise we could try to delete a VDI being created or + # during a snapshot. An exclusive op is the guarantee that + # the SR is locked. + self._linstor = LinstorVolumeManager( + controller_uri, + self._group_name, + repair=( + self._is_master and + self.srcmd.cmd in self.ops_exclusive + ), + logger=util.SMlog + ) + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + + def _ensure_space_available(self, amount_needed): + space_available = self._linstor.max_volume_size_allowed + if (space_available < amount_needed): + util.SMlog( + 'Not enough space! Free space: {}, need: {}'.format( + space_available, amount_needed + ) + ) + raise xs_errors.XenError('SRNoSpace') + + def _kick_gc(self): + # Don't bother if an instance already running. This is just an + # optimization to reduce the overhead of forking a new process if we + # don't have to, but the process will check the lock anyways. + lock = Lock(cleanup.LOCK_TYPE_RUNNING, self.uuid) + if not lock.acquireNoblock(): + if not cleanup.should_preempt(self.session, self.uuid): + util.SMlog('A GC instance already running, not kicking') + return + + util.SMlog('Aborting currently-running coalesce of garbage VDI') + try: + if not cleanup.abort(self.uuid, soft=True): + util.SMlog('The GC has already been scheduled to re-start') + except util.CommandException as e: + if e.code != errno.ETIMEDOUT: + raise + util.SMlog('Failed to abort the GC') + else: + lock.release() + + util.SMlog('Kicking GC') + cleanup.gc(self.session, self.uuid, True) + +# ============================================================================== +# LinstorSr VDI +# ============================================================================== + + +class LinstorVDI(VDI.VDI): + # Warning: Not the same values than vhdutil.VDI_TYPE_*. + # These values represents the types given on the command line. + TYPE_RAW = 'raw' + TYPE_VHD = 'vhd' + + MAX_SIZE = 2 * 1024 * 1024 * 1024 * 1024 # Max VHD size. + + # Metadata size given to the "S" param of vhd-util create. + # "-S size (MB) for metadata preallocation". + # Increase the performance when resize is called. + MAX_METADATA_VIRT_SIZE = 2 * 1024 * 1024 + + # -------------------------------------------------------------------------- + # VDI methods. + # -------------------------------------------------------------------------- + + def load(self, vdi_uuid): + self._lock = self.sr.lock + self._exists = True + self._linstor = self.sr._linstor + + # Update hidden parent property. 
+ self.hidden = False + + def raise_bad_load(e): + util.SMlog( + 'Got exception in LinstorVDI.load: {}'.format(e) + ) + util.SMlog(traceback.format_exc()) + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Could not load {} because: {}'.format(self.uuid, e) + ) + + # Try to load VDI. + try: + if ( + self.sr.srcmd.cmd == 'vdi_attach_from_config' or + self.sr.srcmd.cmd == 'vdi_detach_from_config' + ): + self.vdi_type = vhdutil.VDI_TYPE_RAW + self.path = self.sr.srcmd.params['vdi_path'] + else: + self._determine_type_and_path() + self._load_this() + + util.SMlog('VDI {} loaded! (path={}, hidden={})'.format( + self.uuid, self.path, self.hidden + )) + except LinstorVolumeManagerError as e: + # 1. It may be a VDI deletion. + if e.code == LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: + if self.sr.srcmd.cmd == 'vdi_delete': + self.deleted = True + return + + # 2. Or maybe a creation. + if self.sr.srcmd.cmd == 'vdi_create': + # Set type attribute of VDI parent class. + # We use VHD by default. + self.vdi_type = vhdutil.VDI_TYPE_VHD + self._key_hash = None # Only used in create. + + self._exists = False + vdi_sm_config = self.sr.srcmd.params.get('vdi_sm_config') + if vdi_sm_config is not None: + type = vdi_sm_config.get('type') + if type is not None: + if type == self.TYPE_RAW: + self.vdi_type = vhdutil.VDI_TYPE_RAW + elif type == self.TYPE_VHD: + self.vdi_type = vhdutil.VDI_TYPE_VHD + else: + raise xs_errors.XenError( + 'VDICreate', + opterr='Invalid VDI type {}'.format(type) + ) + if self.vdi_type == vhdutil.VDI_TYPE_VHD: + self._key_hash = vdi_sm_config.get('key_hash') + + # For the moment we don't have a path. + self._update_device_name(None) + return + raise_bad_load(e) + except Exception as e: + raise_bad_load(e) + + def create(self, sr_uuid, vdi_uuid, size): + # Usage example: + # xe vdi-create sr-uuid=39a5826b-5a90-73eb-dd09-51e3a116f937 + # name-label="linstor-vdi-1" virtual-size=4096MiB sm-config:type=vhd + + # 1. Check if we are on the master and if the VDI doesn't exist. + util.SMlog('LinstorVDI.create for {}'.format(self.uuid)) + if self._exists: + raise xs_errors.XenError('VDIExists') + + assert self.uuid + assert self.ty + assert self.vdi_type + + # 2. Compute size and check space available. + size = vhdutil.validate_and_round_vhd_size(long(size)) + volume_size = compute_volume_size(size, self.vdi_type) + util.SMlog( + 'LinstorVDI.create: type={}, vhd-size={}, volume-size={}' + .format(self.vdi_type, size, volume_size) + ) + self.sr._ensure_space_available(volume_size) + + # 3. Set sm_config attribute of VDI parent class. + self.sm_config = self.sr.srcmd.params['vdi_sm_config'] + + # 4. Create! + failed = False + try: + volume_name = None + if self.ty == 'ha_statefile': + volume_name = 'xcp-persistent-ha-statefile' + elif self.ty == 'redo_log': + volume_name = 'xcp-persistent-redo-log' + + self._linstor.create_volume( + self.uuid, volume_size, persistent=False, + volume_name=volume_name + ) + volume_info = self._linstor.get_volume_info(self.uuid) + + self._update_device_name(volume_info.name) + + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + self.size = volume_info.virtual_size + else: + self.sr._vhdutil.create( + self.path, size, False, self.MAX_METADATA_VIRT_SIZE + ) + self.size = self.sr._vhdutil.get_size_virt(self.uuid) + + if self._key_hash: + self.sr._vhdutil.set_key(self.path, self._key_hash) + + # Because vhdutil commands modify the volume data, + # we must retrieve a new time the utilization size. 
+ volume_info = self._linstor.get_volume_info(self.uuid) + + volume_metadata = { + NAME_LABEL_TAG: util.to_plain_string(self.label), + NAME_DESCRIPTION_TAG: util.to_plain_string(self.description), + IS_A_SNAPSHOT_TAG: False, + SNAPSHOT_OF_TAG: '', + SNAPSHOT_TIME_TAG: '', + TYPE_TAG: self.ty, + VDI_TYPE_TAG: self.vdi_type, + READ_ONLY_TAG: bool(self.read_only), + METADATA_OF_POOL_TAG: '' + } + self._linstor.set_volume_metadata(self.uuid, volume_metadata) + + # Set the open timeout to 1min to reduce CPU usage + # in http-disk-server when a secondary server tries to open + # an already opened volume. + if self.ty == 'ha_statefile' or self.ty == 'redo_log': + self._linstor.set_auto_promote_timeout(self.uuid, 600) + + # Increase `ping-timeout` parameter to ensure there is no failure in critical components like `tapdisk`. + # In fact a missed DRBD ACK packet causes EIO errors on `read/write` calls and completely blocks processes. + self._linstor.set_ping_timeout(self.uuid, 300) + + self._linstor.mark_volume_as_persistent(self.uuid) + except util.CommandException as e: + failed = True + raise xs_errors.XenError( + 'VDICreate', opterr='error {}'.format(e.code) + ) + except Exception as e: + failed = True + raise xs_errors.XenError('VDICreate', opterr='error {}'.format(e)) + finally: + if failed: + util.SMlog('Unable to create VDI {}'.format(self.uuid)) + try: + self._linstor.destroy_volume(self.uuid) + except Exception as e: + util.SMlog( + 'Ignoring exception after fail in LinstorVDI.create: ' + '{}'.format(e) + ) + + self.utilisation = volume_info.allocated_size + self.sm_config['vdi_type'] = self.vdi_type + + self.ref = self._db_introduce() + self.sr._update_stats(self.size) + + return VDI.VDI.get_params(self) + + def delete(self, sr_uuid, vdi_uuid, data_only=False): + util.SMlog('LinstorVDI.delete for {}'.format(self.uuid)) + if self.attached: + raise xs_errors.XenError('VDIInUse') + + if self.deleted: + return super(LinstorVDI, self).delete( + sr_uuid, vdi_uuid, data_only + ) + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + if not self.session.xenapi.VDI.get_managed(vdi_ref): + raise xs_errors.XenError( + 'VDIDelete', + opterr='Deleting non-leaf node not permitted' + ) + + try: + # Remove from XAPI and delete from LINSTOR. + self._linstor.destroy_volume(self.uuid) + if not data_only: + self._db_forget() + + self.sr.lock.cleanupAll(vdi_uuid) + except Exception as e: + util.SMlog( + 'Failed to remove the volume (maybe is leaf coalescing) ' + 'for {} err: {}'.format(self.uuid, e) + ) + raise xs_errors.XenError('VDIDelete', opterr=str(e)) + + if self.uuid in self.sr.vdis: + del self.sr.vdis[self.uuid] + + # TODO: Check size after delete. + self.sr._update_stats(-self.size) + self.sr._kick_gc() + return super(LinstorVDI, self).delete(sr_uuid, vdi_uuid, data_only) + + def attach(self, sr_uuid, vdi_uuid): + util.SMlog('LinstorVDI.attach for {}'.format(self.uuid)) + attach_from_config = self.sr.srcmd.cmd == 'vdi_attach_from_config' + if ( + not attach_from_config or + self.sr.srcmd.params['vdi_uuid'] != self.uuid + ) and self.sr._journaler.has_entries(self.uuid): + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Interrupted operation detected on this VDI, ' + 'scan SR first to trigger auto-repair' + ) + + if not attach_from_config or self.sr._is_master: + writable = 'args' not in self.sr.srcmd.params or \ + self.sr.srcmd.params['args'][0] == 'true' + + # We need to inflate the volume if we don't have enough place + # to mount the VHD image. I.e. 
the volume capacity must be greater + # than the VHD size + bitmap size. + need_inflate = True + if ( + self.vdi_type == vhdutil.VDI_TYPE_RAW or + not writable or + self.capacity >= compute_volume_size(self.size, self.vdi_type) + ): + need_inflate = False + + if need_inflate: + try: + self._prepare_thin(True) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to attach VDI during "prepare thin": {}' + .format(e) + ) + + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + self.xenstore_data['storage-type'] = LinstorSR.DRIVER_TYPE + + if ( + USE_HTTP_NBD_SERVERS and + attach_from_config and + self.path.startswith('/dev/http-nbd/') + ): + return self._attach_using_http_nbd() + + # Ensure we have a path... + while vdi_uuid: + path = self._linstor.get_device_path(vdi_uuid) + if not util.pathexists(path): + raise xs_errors.XenError( + 'VDIUnavailable', opterr='Could not find: {}'.format(path) + ) + vdi_uuid = self.sr._vhdutil.get_vhd_info(vdi_uuid).parentUuid + + self.attached = True + return VDI.VDI.attach(self, self.sr.uuid, self.uuid) + + def detach(self, sr_uuid, vdi_uuid): + util.SMlog('LinstorVDI.detach for {}'.format(self.uuid)) + detach_from_config = self.sr.srcmd.cmd == 'vdi_detach_from_config' + self.attached = False + + if detach_from_config and self.path.startswith('/dev/http-nbd/'): + return self._detach_using_http_nbd() + + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + return + + # The VDI is already deflated if the VHD image size + metadata is + # equal to the LINSTOR volume size. + volume_size = compute_volume_size(self.size, self.vdi_type) + already_deflated = self.capacity <= volume_size + + if already_deflated: + util.SMlog( + 'VDI {} already deflated (old volume size={}, volume size={})' + .format(self.uuid, self.capacity, volume_size) + ) + + need_deflate = True + if already_deflated: + need_deflate = False + elif self.sr._provisioning == 'thick': + need_deflate = False + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + if self.session.xenapi.VDI.get_is_a_snapshot(vdi_ref): + need_deflate = True + + if need_deflate: + try: + self._prepare_thin(False) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to detach VDI during "prepare thin": {}' + .format(e) + ) + + # We remove only on slaves because the volume can be used by the GC. + if self.sr._is_master: + return + + while vdi_uuid: + try: + path = self._linstor.build_device_path(self._linstor.get_volume_name(vdi_uuid)) + parent_vdi_uuid = self.sr._vhdutil.get_vhd_info(vdi_uuid).parentUuid + except Exception: + break + + if util.pathexists(path): + try: + self._linstor.remove_volume_if_diskless(vdi_uuid) + except Exception as e: + # Ensure we can always detach properly. + # I don't want to corrupt the XAPI info. + util.SMlog('Failed to clean VDI {} during detach: {}'.format(vdi_uuid, e)) + vdi_uuid = parent_vdi_uuid + + def resize(self, sr_uuid, vdi_uuid, size): + util.SMlog('LinstorVDI.resize for {}'.format(self.uuid)) + if not self.sr._is_master: + raise xs_errors.XenError( + 'VDISize', + opterr='resize on slave not allowed' + ) + + if self.hidden: + raise xs_errors.XenError('VDIUnavailable', opterr='hidden VDI') + + # Compute the virtual VHD and DRBD volume size. 
+ size = vhdutil.validate_and_round_vhd_size(long(size)) + volume_size = compute_volume_size(size, self.vdi_type) + util.SMlog( + 'LinstorVDI.resize: type={}, vhd-size={}, volume-size={}' + .format(self.vdi_type, size, volume_size) + ) + + if size < self.size: + util.SMlog( + 'vdi_resize: shrinking not supported: ' + '(current size: {}, new size: {})'.format(self.size, size) + ) + raise xs_errors.XenError('VDISize', opterr='shrinking not allowed') + + if size == self.size: + return VDI.VDI.get_params(self) + + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + old_volume_size = self.size + else: + old_volume_size = self.utilisation + if self.sr._provisioning == 'thin': + # VDI is currently deflated, so keep it deflated. + new_volume_size = old_volume_size + assert new_volume_size >= old_volume_size + + space_needed = new_volume_size - old_volume_size + self.sr._ensure_space_available(space_needed) + + old_size = self.size + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + self._linstor.resize(self.uuid, new_volume_size) + else: + if new_volume_size != old_volume_size: + inflate( + self.sr._journaler, self._linstor, self.uuid, self.path, + new_volume_size, old_volume_size + ) + self.sr._vhdutil.set_size_virt_fast(self.path, size) + + # Reload size attributes. + self._load_this() + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + self.session.xenapi.VDI.set_virtual_size(vdi_ref, str(self.size)) + self.session.xenapi.VDI.set_physical_utilisation( + vdi_ref, str(self.utilisation) + ) + self.sr._update_stats(self.size - old_size) + return VDI.VDI.get_params(self) + + def clone(self, sr_uuid, vdi_uuid): + return self._do_snapshot(sr_uuid, vdi_uuid, VDI.SNAPSHOT_DOUBLE) + + def compose(self, sr_uuid, vdi1, vdi2): + util.SMlog('VDI.compose for {} -> {}'.format(vdi2, vdi1)) + if self.vdi_type != vhdutil.VDI_TYPE_VHD: + raise xs_errors.XenError('Unimplemented') + + parent_uuid = vdi1 + parent_path = self._linstor.get_device_path(parent_uuid) + + # We must pause tapdisk to correctly change the parent. Otherwise we + # have a readonly error. + # See: https://github.com/xapi-project/xen-api/blob/b3169a16d36dae0654881b336801910811a399d9/ocaml/xapi/storage_migrate.ml#L928-L929 + # and: https://github.com/xapi-project/xen-api/blob/b3169a16d36dae0654881b336801910811a399d9/ocaml/xapi/storage_migrate.ml#L775 + + if not blktap2.VDI.tap_pause(self.session, self.sr.uuid, self.uuid): + raise util.SMException('Failed to pause VDI {}'.format(self.uuid)) + try: + self.sr._vhdutil.set_parent(self.path, parent_path, False) + self.sr._vhdutil.set_hidden(parent_path) + self.sr.session.xenapi.VDI.set_managed( + self.sr.srcmd.params['args'][0], False + ) + finally: + blktap2.VDI.tap_unpause(self.session, self.sr.uuid, self.uuid) + + if not blktap2.VDI.tap_refresh(self.session, self.sr.uuid, self.uuid): + raise util.SMException( + 'Failed to refresh VDI {}'.format(self.uuid) + ) + + util.SMlog('Compose done') + + def generate_config(self, sr_uuid, vdi_uuid): + """ + Generate the XML config required to attach and activate + a VDI for use when XAPI is not running. Attach and + activation is handled by vdi_attach_from_config below. + """ + + util.SMlog('LinstorVDI.generate_config for {}'.format(self.uuid)) + + resp = {} + resp['device_config'] = self.sr.dconf + resp['sr_uuid'] = sr_uuid + resp['vdi_uuid'] = self.uuid + resp['sr_sm_config'] = self.sr.sm_config + resp['command'] = 'vdi_attach_from_config' + + # By default, we generate a normal config. 
+ # But if the disk is persistent, we must use a HTTP/NBD + # server to ensure we can always write or read data. + # Why? DRBD is unsafe when used with more than 4 hosts: + # We are limited to use 1 diskless and 3 full. + # We can't increase this limitation, so we use a NBD/HTTP device + # instead. + volume_name = self._linstor.get_volume_name(self.uuid) + if not USE_HTTP_NBD_SERVERS or volume_name not in [ + 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' + ]: + if not self.path or not util.pathexists(self.path): + available = False + # Try to refresh symlink path... + try: + self.path = self._linstor.get_device_path(vdi_uuid) + available = util.pathexists(self.path) + except Exception: + pass + if not available: + raise xs_errors.XenError('VDIUnavailable') + + resp['vdi_path'] = self.path + else: + # Axiom: DRBD device is present on at least one host. + resp['vdi_path'] = '/dev/http-nbd/' + volume_name + + config = xmlrpclib.dumps(tuple([resp]), 'vdi_attach_from_config') + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + """ + Attach and activate a VDI using config generated by + vdi_generate_config above. This is used for cases such as + the HA state-file and the redo-log. + """ + + util.SMlog('LinstorVDI.attach_from_config for {}'.format(vdi_uuid)) + + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + + if not DRIVER_CONFIG['ATTACH_FROM_CONFIG_WITH_TAPDISK']: + return self.attach(sr_uuid, vdi_uuid) + except Exception: + util.logException('LinstorVDI.attach_from_config') + raise xs_errors.XenError( + 'SRUnavailable', + opterr='Unable to attach from config' + ) + + def reset_leaf(self, sr_uuid, vdi_uuid): + if self.vdi_type != vhdutil.VDI_TYPE_VHD: + raise xs_errors.XenError('Unimplemented') + + if not self.sr._vhdutil.has_parent(self.uuid): + raise util.SMException( + 'ERROR: VDI {} has no parent, will not reset contents' + .format(self.uuid) + ) + + self.sr._vhdutil.kill_data(self.path) + + def _load_this(self): + volume_metadata = None + if self.sr._all_volume_metadata_cache: + volume_metadata = self.sr._all_volume_metadata_cache.get(self.uuid) + if volume_metadata is None: + volume_metadata = self._linstor.get_volume_metadata(self.uuid) + + volume_info = None + if self.sr._all_volume_info_cache: + volume_info = self.sr._all_volume_info_cache.get(self.uuid) + if volume_info is None: + volume_info = self._linstor.get_volume_info(self.uuid) + + # Contains the max physical size used on a disk. + # When LINSTOR LVM driver is used, the size should be similar to + # virtual size (i.e. the LINSTOR max volume size). + # When LINSTOR Thin LVM driver is used, the used physical size should + # be lower than virtual size at creation. + # The physical size increases after each write in a new block. + self.utilisation = volume_info.allocated_size + self.capacity = volume_info.virtual_size + + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + self.hidden = int(volume_metadata.get(HIDDEN_TAG) or 0) + self.size = volume_info.virtual_size + self.parent = '' + else: + vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) + self.hidden = vhd_info.hidden + self.size = vhd_info.sizeVirt + self.parent = vhd_info.parentUuid + + if self.hidden: + self.managed = False + + self.label = volume_metadata.get(NAME_LABEL_TAG) or '' + self.description = volume_metadata.get(NAME_DESCRIPTION_TAG) or '' + + # Update sm_config_override of VDI parent class. 
+ self.sm_config_override = {'vhd-parent': self.parent or None} + + def _mark_hidden(self, hidden=True): + if self.hidden == hidden: + return + + if self.vdi_type == vhdutil.VDI_TYPE_VHD: + self.sr._vhdutil.set_hidden(self.path, hidden) + else: + self._linstor.update_volume_metadata(self.uuid, { + HIDDEN_TAG: hidden + }) + self.hidden = hidden + + def update(self, sr_uuid, vdi_uuid): + xenapi = self.session.xenapi + vdi_ref = xenapi.VDI.get_by_uuid(self.uuid) + + volume_metadata = { + NAME_LABEL_TAG: util.to_plain_string( + xenapi.VDI.get_name_label(vdi_ref) + ), + NAME_DESCRIPTION_TAG: util.to_plain_string( + xenapi.VDI.get_name_description(vdi_ref) + ) + } + + try: + self._linstor.update_volume_metadata(self.uuid, volume_metadata) + except LinstorVolumeManagerError as e: + if e.code == LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='LINSTOR volume {} not found'.format(self.uuid) + ) + raise xs_errors.XenError('VDIUnavailable', opterr=str(e)) + + # -------------------------------------------------------------------------- + # Thin provisioning. + # -------------------------------------------------------------------------- + + def _prepare_thin(self, attach): + if self.sr._is_master: + if attach: + attach_thin( + self.session, self.sr._journaler, self._linstor, + self.sr.uuid, self.uuid + ) + else: + detach_thin( + self.session, self._linstor, self.sr.uuid, self.uuid + ) + else: + fn = 'attach' if attach else 'detach' + + master = util.get_master_ref(self.session) + + args = { + 'groupName': self.sr._group_name, + 'srUuid': self.sr.uuid, + 'vdiUuid': self.uuid + } + self.sr._exec_manager_command(master, fn, args, 'VDIUnavailable') + + # Reload size attrs after inflate or deflate! + self._load_this() + self.sr._update_physical_size() + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + self.session.xenapi.VDI.set_physical_utilisation( + vdi_ref, str(self.utilisation) + ) + + self.session.xenapi.SR.set_physical_utilisation( + self.sr.sr_ref, str(self.sr.physical_utilisation) + ) + + # -------------------------------------------------------------------------- + # Generic helpers. + # -------------------------------------------------------------------------- + + def _determine_type_and_path(self): + """ + Determine whether this is a RAW or a VHD VDI. + """ + + # 1. Check vdi_ref and vdi_type in config. + try: + vdi_ref = self.session.xenapi.VDI.get_by_uuid(self.uuid) + if vdi_ref: + sm_config = self.session.xenapi.VDI.get_sm_config(vdi_ref) + vdi_type = sm_config.get('vdi_type') + if vdi_type: + # Update parent fields. + self.vdi_type = vdi_type + self.sm_config_override = sm_config + self._update_device_name( + self._linstor.get_volume_name(self.uuid) + ) + return + except Exception: + pass + + # 2. Otherwise use the LINSTOR volume manager directly. + # It's probably a new VDI created via snapshot. + volume_metadata = self._linstor.get_volume_metadata(self.uuid) + self.vdi_type = volume_metadata.get(VDI_TYPE_TAG) + if not self.vdi_type: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='failed to get vdi_type in metadata' + ) + self._update_device_name(self._linstor.get_volume_name(self.uuid)) + + def _update_device_name(self, device_name): + self._device_name = device_name + + # Mark path of VDI parent class. 
+ if device_name: + self.path = self._linstor.build_device_path(self._device_name) + else: + self.path = None + + def _create_snapshot(self, snap_uuid, snap_of_uuid=None): + """ + Snapshot self and return the snapshot VDI object. + """ + + # 1. Create a new LINSTOR volume with the same size than self. + snap_path = self._linstor.shallow_clone_volume( + self.uuid, snap_uuid, persistent=False + ) + + # 2. Write the snapshot content. + is_raw = (self.vdi_type == vhdutil.VDI_TYPE_RAW) + self.sr._vhdutil.snapshot( + snap_path, self.path, is_raw, self.MAX_METADATA_VIRT_SIZE + ) + + # 3. Get snapshot parent. + snap_parent = self.sr._vhdutil.get_parent(snap_uuid) + + # 4. Update metadata. + util.SMlog('Set VDI {} metadata of snapshot'.format(snap_uuid)) + volume_metadata = { + NAME_LABEL_TAG: util.to_plain_string(self.label), + NAME_DESCRIPTION_TAG: util.to_plain_string(self.description), + IS_A_SNAPSHOT_TAG: bool(snap_of_uuid), + SNAPSHOT_OF_TAG: snap_of_uuid, + SNAPSHOT_TIME_TAG: '', + TYPE_TAG: self.ty, + VDI_TYPE_TAG: vhdutil.VDI_TYPE_VHD, + READ_ONLY_TAG: False, + METADATA_OF_POOL_TAG: '' + } + self._linstor.set_volume_metadata(snap_uuid, volume_metadata) + + # 5. Set size. + snap_vdi = LinstorVDI(self.sr, snap_uuid) + if not snap_vdi._exists: + raise xs_errors.XenError('VDISnapshot') + + volume_info = self._linstor.get_volume_info(snap_uuid) + + snap_vdi.size = self.sr._vhdutil.get_size_virt(snap_uuid) + snap_vdi.utilisation = volume_info.allocated_size + + # 6. Update sm config. + snap_vdi.sm_config = {} + snap_vdi.sm_config['vdi_type'] = snap_vdi.vdi_type + if snap_parent: + snap_vdi.sm_config['vhd-parent'] = snap_parent + snap_vdi.parent = snap_parent + + snap_vdi.label = self.label + snap_vdi.description = self.description + + self._linstor.mark_volume_as_persistent(snap_uuid) + + return snap_vdi + + # -------------------------------------------------------------------------- + # Implement specific SR methods. + # -------------------------------------------------------------------------- + + def _rename(self, oldpath, newpath): + # TODO: I'm not sure... Used by CBT. + volume_uuid = self._linstor.get_volume_uuid_from_device_path(oldpath) + self._linstor.update_volume_name(volume_uuid, newpath) + + def _do_snapshot( + self, sr_uuid, vdi_uuid, snap_type, secondary=None, cbtlog=None + ): + # If cbt enabled, save file consistency state. + if cbtlog is not None: + if blktap2.VDI.tap_status(self.session, vdi_uuid): + consistency_state = False + else: + consistency_state = True + util.SMlog( + 'Saving log consistency state of {} for vdi: {}' + .format(consistency_state, vdi_uuid) + ) + else: + consistency_state = None + + if self.vdi_type != vhdutil.VDI_TYPE_VHD: + raise xs_errors.XenError('Unimplemented') + + if not blktap2.VDI.tap_pause(self.session, sr_uuid, vdi_uuid): + raise util.SMException('Failed to pause VDI {}'.format(vdi_uuid)) + try: + return self._snapshot(snap_type, cbtlog, consistency_state) + finally: + blktap2.VDI.tap_unpause(self.session, sr_uuid, vdi_uuid, secondary) + + def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): + util.SMlog( + 'LinstorVDI._snapshot for {} (type {})' + .format(self.uuid, snap_type) + ) + + # 1. Checks... 
+ if self.hidden: + raise xs_errors.XenError('VDIClone', opterr='hidden VDI') + + depth = self.sr._vhdutil.get_depth(self.uuid) + if depth == -1: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='failed to get VHD depth' + ) + elif depth >= vhdutil.MAX_CHAIN_SIZE: + raise xs_errors.XenError('SnapshotChainTooLong') + + # Ensure we have a valid path if we don't have a local diskful. + self.sr._linstor.get_device_path(self.uuid) + + volume_path = self.path + if not util.pathexists(volume_path): + raise xs_errors.XenError( + 'EIO', + opterr='IO error checking path {}'.format(volume_path) + ) + + # 2. Create base and snap uuid (if required) and a journal entry. + base_uuid = util.gen_uuid() + snap_uuid = None + + if snap_type == VDI.SNAPSHOT_DOUBLE: + snap_uuid = util.gen_uuid() + + clone_info = '{}_{}'.format(base_uuid, snap_uuid) + + active_uuid = self.uuid + self.sr._journaler.create( + LinstorJournaler.CLONE, active_uuid, clone_info + ) + + try: + # 3. Self becomes the new base. + # The device path remains the same. + self._linstor.update_volume_uuid(self.uuid, base_uuid) + self.uuid = base_uuid + self.location = self.uuid + self.read_only = True + self.managed = False + + # 4. Create snapshots (new active and snap). + active_vdi = self._create_snapshot(active_uuid) + + snap_vdi = None + if snap_type == VDI.SNAPSHOT_DOUBLE: + snap_vdi = self._create_snapshot(snap_uuid, active_uuid) + + self.label = 'base copy' + self.description = '' + + # 5. Mark the base VDI as hidden so that it does not show up + # in subsequent scans. + self._mark_hidden() + self._linstor.update_volume_metadata( + self.uuid, {READ_ONLY_TAG: True} + ) + + # 6. We must update the new active VDI with the "paused" and + # "host_" properties. Why? Because the original VDI has been + # paused and we we must unpause it after the snapshot. + # See: `tap_unpause` in `blktap2.py`. + vdi_ref = self.session.xenapi.VDI.get_by_uuid(active_uuid) + sm_config = self.session.xenapi.VDI.get_sm_config(vdi_ref) + for key in filter( + lambda x: x == 'paused' or x.startswith('host_'), + sm_config.keys() + ): + active_vdi.sm_config[key] = sm_config[key] + + # 7. Verify parent locator field of both children and + # delete base if unused. + introduce_parent = True + try: + snap_parent = None + if snap_vdi: + snap_parent = snap_vdi.parent + + if active_vdi.parent != self.uuid and ( + snap_type == VDI.SNAPSHOT_SINGLE or + snap_type == VDI.SNAPSHOT_INTERNAL or + snap_parent != self.uuid + ): + util.SMlog( + 'Destroy unused base volume: {} (path={})' + .format(self.uuid, self.path) + ) + introduce_parent = False + self._linstor.destroy_volume(self.uuid) + except Exception as e: + util.SMlog('Ignoring exception: {}'.format(e)) + pass + + # 8. Introduce the new VDI records. + if snap_vdi: + # If the parent is encrypted set the key_hash for the + # new snapshot disk. + vdi_ref = self.sr.srcmd.params['vdi_ref'] + sm_config = self.session.xenapi.VDI.get_sm_config(vdi_ref) + # TODO: Maybe remove key_hash support. + if 'key_hash' in sm_config: + snap_vdi.sm_config['key_hash'] = sm_config['key_hash'] + # If we have CBT enabled on the VDI, + # set CBT status for the new snapshot disk. 
+ if cbtlog: + snap_vdi.cbt_enabled = True + + if snap_vdi: + snap_vdi_ref = snap_vdi._db_introduce() + util.SMlog( + 'vdi_clone: introduced VDI: {} ({})' + .format(snap_vdi_ref, snap_vdi.uuid) + ) + if introduce_parent: + base_vdi_ref = self._db_introduce() + self.session.xenapi.VDI.set_managed(base_vdi_ref, False) + util.SMlog( + 'vdi_clone: introduced VDI: {} ({})' + .format(base_vdi_ref, self.uuid) + ) + self._linstor.update_volume_metadata(self.uuid, { + NAME_LABEL_TAG: util.to_plain_string(self.label), + NAME_DESCRIPTION_TAG: util.to_plain_string( + self.description + ), + READ_ONLY_TAG: True, + METADATA_OF_POOL_TAG: '' + }) + + # 9. Update cbt files if user created snapshot (SNAPSHOT_DOUBLE) + if snap_type == VDI.SNAPSHOT_DOUBLE and cbtlog: + try: + self._cbt_snapshot(snap_uuid, cbt_consistency) + except Exception: + # CBT operation failed. + # TODO: Implement me. + raise + + if snap_type != VDI.SNAPSHOT_INTERNAL: + self.sr._update_stats(self.size) + + # 10. Return info on the new user-visible leaf VDI. + ret_vdi = snap_vdi + if not ret_vdi: + ret_vdi = self + if not ret_vdi: + ret_vdi = active_vdi + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + self.session.xenapi.VDI.set_sm_config( + vdi_ref, active_vdi.sm_config + ) + except Exception as e: + util.logException('Failed to snapshot!') + try: + self.sr._handle_interrupted_clone( + active_uuid, clone_info, force_undo=True + ) + self.sr._journaler.remove(LinstorJournaler.CLONE, active_uuid) + except Exception as e: + util.SMlog( + 'WARNING: Failed to clean up failed snapshot: {}' + .format(e) + ) + raise xs_errors.XenError('VDIClone', opterr=str(e)) + + self.sr._journaler.remove(LinstorJournaler.CLONE, active_uuid) + + return ret_vdi.get_params() + + @staticmethod + def _start_persistent_http_server(volume_name): + pid_path = None + http_server = None + + try: + if volume_name == 'xcp-persistent-ha-statefile': + port = '8076' + else: + port = '8077' + + try: + # Use a timeout call because XAPI may be unusable on startup + # or if the host has been ejected. So in this case the call can + # block indefinitely. + session = util.timeout_call(5, util.get_localAPI_session) + host_ip = util.get_this_host_address(session) + except: + # Fallback using the XHA file if session not available. + host_ip, _ = get_ips_from_xha_config_file() + if not host_ip: + raise Exception( + 'Cannot start persistent HTTP server: no XAPI session, nor XHA config file' + ) + + arguments = [ + 'http-disk-server', + '--disk', + '/dev/drbd/by-res/{}/0'.format(volume_name), + '--ip', + host_ip, + '--port', + port + ] + + util.SMlog('Starting {} on port {}...'.format(arguments[0], port)) + http_server = subprocess.Popen( + [FORK_LOG_DAEMON] + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + # Ensure we use another group id to kill this process without + # touch the current one. 
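+ # os.setsid starts the server in a new session and process group, so
+ # the later os.killpg(os.getpgid(pid), ...) calls signal only the
+ # server and its children, never the calling smapi process.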
+ preexec_fn=os.setsid + ) + + pid_path = '/run/http-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(http_server.pid)) + + reg_server_ready = re.compile("Server ready!$") + def is_ready(): + while http_server.poll() is None: + line = http_server.stdout.readline() + if reg_server_ready.search(line): + return True + return False + try: + if not util.timeout_call(10, is_ready): + raise Exception('Failed to wait HTTP server startup, bad output') + except util.TimeoutException: + raise Exception('Failed to wait for HTTP server startup during given delay') + except Exception as e: + if pid_path: + try: + os.remove(pid_path) + except Exception: + pass + + if http_server: + # Kill process and children in this case... + try: + os.killpg(os.getpgid(http_server.pid), signal.SIGTERM) + except: + pass + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to start http-server: {}'.format(e) + ) + + def _start_persistent_nbd_server(self, volume_name): + pid_path = None + nbd_path = None + nbd_server = None + + try: + # We use a precomputed device size. + # So if the XAPI is modified, we must update these values! + if volume_name == 'xcp-persistent-ha-statefile': + # See: https://github.com/xapi-project/xen-api/blob/703479fa448a8d7141954bb6e8964d8e25c4ac2e/ocaml/xapi/xha_statefile.ml#L32-L37 + port = '8076' + device_size = 4 * 1024 * 1024 + else: + # See: https://github.com/xapi-project/xen-api/blob/703479fa448a8d7141954bb6e8964d8e25c4ac2e/ocaml/database/redo_log.ml#L41-L44 + port = '8077' + device_size = 256 * 1024 * 1024 + + try: + session = util.timeout_call(5, util.get_localAPI_session) + ips = util.get_host_addresses(session) + except Exception as e: + _, ips = get_ips_from_xha_config_file() + if not ips: + raise Exception( + 'Cannot start persistent NBD server: no XAPI session, nor XHA config file ({})'.format(e) + ) + ips = ips.values() + + arguments = [ + 'nbd-http-server', + '--socket-path', + '/run/{}.socket'.format(volume_name), + '--nbd-name', + volume_name, + '--urls', + ','.join(map(lambda ip: 'http://' + ip + ':' + port, ips)), + '--device-size', + str(device_size) + ] + + util.SMlog('Starting {} using port {}...'.format(arguments[0], port)) + nbd_server = subprocess.Popen( + [FORK_LOG_DAEMON] + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + # Ensure we use another group id to kill this process without + # touch the current one. + preexec_fn=os.setsid + ) + + pid_path = '/run/nbd-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(nbd_server.pid)) + + reg_nbd_path = re.compile("NBD `(/dev/nbd[0-9]+)` is now attached.$") + def get_nbd_path(): + while nbd_server.poll() is None: + line = nbd_server.stdout.readline() + match = reg_nbd_path.search(line) + if match: + return match.group(1) + # Use a timeout to never block the smapi if there is a problem. + try: + nbd_path = util.timeout_call(10, get_nbd_path) + if nbd_path is None: + raise Exception('Empty NBD path (NBD server is probably dead)') + except util.TimeoutException: + raise Exception('Unable to read NBD path') + + util.SMlog('Create symlink: {} -> {}'.format(self.path, nbd_path)) + os.symlink(nbd_path, self.path) + except Exception as e: + if pid_path: + try: + os.remove(pid_path) + except Exception: + pass + + if nbd_path: + try: + os.remove(nbd_path) + except Exception: + pass + + if nbd_server: + # Kill process and children in this case... 
+ try: + os.killpg(os.getpgid(nbd_server.pid), signal.SIGTERM) + except: + pass + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to start nbd-server: {}'.format(e) + ) + + @classmethod + def _kill_persistent_server(cls, type, volume_name, sig): + try: + path = '/run/{}-server-{}.pid'.format(type, volume_name) + if not os.path.exists(path): + return + + pid = None + with open(path, 'r') as pid_file: + try: + pid = int(pid_file.read()) + except Exception: + pass + + if pid is not None and util.check_pid_exists(pid): + util.SMlog('Kill {} server {} (pid={})'.format(type, path, pid)) + try: + os.killpg(os.getpgid(pid), sig) + except Exception as e: + util.SMlog('Failed to kill {} server: {}'.format(type, e)) + + os.remove(path) + except: + pass + + @classmethod + def _kill_persistent_http_server(cls, volume_name, sig=signal.SIGTERM): + return cls._kill_persistent_server('http', volume_name, sig) + + @classmethod + def _kill_persistent_nbd_server(cls, volume_name, sig=signal.SIGTERM): + return cls._kill_persistent_server('nbd', volume_name, sig) + + def _check_http_nbd_volume_name(self): + volume_name = self.path[14:] + if volume_name not in [ + 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' + ]: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unsupported path: {}'.format(self.path) + ) + return volume_name + + def _attach_using_http_nbd(self): + volume_name = self._check_http_nbd_volume_name() + + # Ensure no NBD or HTTP server is already running. + self._kill_persistent_nbd_server(volume_name) + self._kill_persistent_http_server(volume_name) + + # 0. Fetch drbd path. + must_get_device_path = True + if not self.sr._is_master: + # We are on a slave, we must try to find a diskful locally. + try: + volume_info = self._linstor.get_volume_info(self.uuid) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot get volume info of {}: {}' + .format(self.uuid, e) + ) + + hostname = socket.gethostname() + must_get_device_path = hostname in volume_info.diskful + + drbd_path = None + if must_get_device_path or self.sr._is_master: + # If we are master, we must ensure we have a diskless + # or diskful available to init HA. + # It also avoids this error in xensource.log + # (/usr/libexec/xapi/cluster-stack/xhad/ha_set_pool_state): + # init exited with code 8 [stdout = ''; stderr = 'SF: failed to write in State-File \x10 (fd 4208696). (sys 28)\x0A'] + # init returned MTC_EXIT_CAN_NOT_ACCESS_STATEFILE (State-File is inaccessible) + available = False + try: + drbd_path = self._linstor.get_device_path(self.uuid) + available = util.pathexists(drbd_path) + except Exception: + pass + + if not available: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot get device path of {}'.format(self.uuid) + ) + + # 1. Prepare http-nbd folder. + try: + if not os.path.exists('/dev/http-nbd/'): + os.makedirs('/dev/http-nbd/') + elif os.path.islink(self.path): + os.remove(self.path) + except OSError as e: + if e.errno != errno.EEXIST: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot prepare http-nbd: {}'.format(e) + ) + + # 2. Start HTTP service if we have a diskful or if we are master. + http_service = None + if drbd_path: + assert(drbd_path in ( + '/dev/drbd/by-res/xcp-persistent-ha-statefile/0', + '/dev/drbd/by-res/xcp-persistent-redo-log/0' + )) + self._start_persistent_http_server(volume_name) + + # 3. Start NBD server in all cases.
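+ # If the NBD server cannot be started, the HTTP server launched in + # step 2 (when drbd_path is set) is stopped again before re-raising.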
+ try: + self._start_persistent_nbd_server(volume_name) + except Exception as e: + if drbd_path: + self._kill_persistent_http_server(volume_name) + raise + + self.attached = True + return VDI.VDI.attach(self, self.sr.uuid, self.uuid) + + def _detach_using_http_nbd(self): + volume_name = self._check_http_nbd_volume_name() + self._kill_persistent_nbd_server(volume_name) + self._kill_persistent_http_server(volume_name) + +# ------------------------------------------------------------------------------ + + +if __name__ == '__main__': + def run(): + SRCommand.run(LinstorSR, DRIVER_INFO) + + if not TRACE_PERFS: + run() + else: + util.make_profile('LinstorSR', run) +else: + SR.registerSR(LinstorSR) diff --git a/drivers/MooseFSSR.py b/drivers/MooseFSSR.py new file mode 100755 index 00000000..212f1ad2 --- /dev/null +++ b/drivers/MooseFSSR.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Tappest sp. z o.o., Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# MooseFSSR: Based on CEPHFSSR and FileSR, mounts MooseFS share + +import distutils.util +import errno +import os +import syslog as _syslog +import xmlrpclib +from syslog import syslog + +# careful with the import order here +# FileSR has a circular dependency: +# FileSR -> blktap2 -> lvutil -> EXTSR -> FileSR +# importing in this order seems to avoid triggering the issue. +import SR +import SRCommand +import FileSR +# end of careful +import cleanup +import util +import vhdutil +import xs_errors +from lock import Lock + +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", + "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", + "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", + "VDI_GENERATE_CONFIG", + "VDI_RESET_ON_BOOT/2", "ATOMIC_PAUSE"] + +CONFIGURATION = [ + ['masterhost', 'MooseFS Master Server hostname or IP address (required, e.g.: "mfsmaster.local.lan" or "10.10.10.1")'], + ['masterport', 'MooseFS Master Server port, default: 9421'], + ['rootpath', 'MooseFS path (required, e.g.: "/")'], + ['options', 'MooseFS Client additional options (e.g.: "mfspassword=PASSWORD,mfstimeout=300")'] +] + +DRIVER_INFO = { + 'name': 'MooseFS VHD', + 'description': 'SR plugin which stores disks as VHD files on a MooseFS storage', + 'vendor': 'Tappest sp. z o.o.', + 'copyright': '(C) 2021 Tappest sp. z o.o.', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +# The mountpoint for the directory when performing an sr_probe. All probes +# are guaranteed to be serialised by xapi, so this single mountpoint is fine. 
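+# The probe directory therefore lives next to the regular SR mountpoints, + # as SR.MOUNT_BASE/probe.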
+PROBE_MOUNTPOINT = os.path.join(SR.MOUNT_BASE, "probe") + + +class MooseFSException(Exception): + def __init__(self, errstr): + self.errstr = errstr + + +class MooseFSSR(FileSR.FileSR): + """MooseFS file-based storage""" + + DRIVER_TYPE = 'moosefs' + + def handles(sr_type): + # fudge, because the parent class (FileSR) checks for smb to alter its behavior + return sr_type == MooseFSSR.DRIVER_TYPE or sr_type == 'smb' + + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_moosefs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='MooseFS Client is not installed!' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + self.driver_config = DRIVER_CONFIG + if 'masterhost' not in self.dconf: + raise xs_errors.XenError('ConfigServerMissing') + self.remoteserver = self.dconf['masterhost'] + self.rootpath = self.dconf['rootpath'] + self.remotepath = self.rootpath + # if masterport is not specified, use default: 9421 + if 'masterport' not in self.dconf: + self.remoteport = "9421" + else: + self.remoteport = self.dconf['masterport'] + if self.sr_ref and self.session is not None: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + + if self.srcmd.cmd != 'sr_create': + self.subdir = distutils.util.strtobool( + self.sm_config.get('subdir') or '0' + ) + if self.subdir: + self.remotepath = os.path.join(self.remotepath, sr_uuid) + + self.attached = False + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self.mountpoint = self.path + self.linkpath = self.path + self._check_o_direct() + + def checkmount(self): + return util.ioretry(lambda: ((util.pathexists(self.mountpoint) and + util.ismount(self.mountpoint)))) + + def mount(self, mountpoint=None): + """Mount MooseFS share at 'mountpoint'""" + if mountpoint is None: + mountpoint = self.mountpoint + elif not util.is_string(mountpoint) or mountpoint == "": + raise MooseFSException("Mountpoint is not a string object") + + try: + if not util.ioretry(lambda: util.isdir(mountpoint)): + util.ioretry(lambda: util.makedirs(mountpoint)) + except util.CommandException, inst: + raise MooseFSException("Failed to make directory: code is %d" % inst.code) + + try: + options = [] + if self.dconf.has_key('options'): + options.append(self.dconf['options']) + if options: + options = ['-o', ','.join(options)] + remote = '{}:{}:{}'.format( + self.remoteserver, self.remoteport, self.remotepath + ) + command = ["mount", '-t', 'moosefs', remote, mountpoint] + options + util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) + except util.CommandException, inst: + syslog(_syslog.LOG_ERR, 'MooseFS mount failed ' + inst.__str__()) + raise MooseFSException("Mount failed with return code %d" % inst.code) + + # Sanity check to ensure that the user has at least RO access to the + # mounted share. Windows sharing and security settings can be tricky. + try: + util.listdir(mountpoint) + except util.CommandException: + try: + self.unmount(mountpoint, True) + except MooseFSException: + util.logException('MooseFSSR.unmount()') + raise MooseFSException("Permission denied. 
Please check user privileges.") + + def unmount(self, mountpoint, rmmountpoint): + try: + util.pread(["umount", mountpoint]) + except util.CommandException, inst: + raise MooseFSException("Command umount failed with return code %d" % inst.code) + if rmmountpoint: + try: + os.rmdir(mountpoint) + except OSError, inst: + raise MooseFSException("Command rmdir failed with error '%s'" % inst.strerror) + + def attach(self, sr_uuid): + if not self.checkmount(): + try: + self.mount() + except MooseFSException, exc: + raise SR.SROSError(12, exc.errstr) + self.attached = True + + def probe(self): + try: + self.mount(PROBE_MOUNTPOINT) + sr_list = filter(util.match_uuid, util.listdir(PROBE_MOUNTPOINT)) + self.unmount(PROBE_MOUNTPOINT, True) + except (util.CommandException, xs_errors.XenError): + raise + # Create a dictionary from the SR uuids to feed SRtoXML() + sr_dict = {sr_uuid: {} for sr_uuid in sr_list} + return util.SRtoXML(sr_dict) + + def detach(self, sr_uuid): + if not self.checkmount(): + return + util.SMlog("Aborting GC/coalesce") + cleanup.abort(sr_uuid) + # Change directory to avoid unmount conflicts + os.chdir(SR.MOUNT_BASE) + self.unmount(self.mountpoint, True) + self.attached = False + + def create(self, sr_uuid, size): + if self.checkmount(): + raise SR.SROSError(113, 'MooseFS mount point already attached') + + assert self.remotepath == self.rootpath + try: + self.mount() + except MooseFSException, exc: + # noinspection PyBroadException + try: + os.rmdir(self.mountpoint) + except: + # we have no recovery strategy + pass + raise SR.SROSError(111, "MooseFS mount error [opterr=%s]" % exc.errstr) + + try: + self.subdir = self.sm_config.get('subdir') + if self.subdir is None: + self.subdir = True + else: + self.subdir = distutils.util.strtobool(self.subdir) + + self.sm_config['subdir'] = str(self.subdir) + self.session.xenapi.SR.set_sm_config(self.sr_ref, self.sm_config) + + if not self.subdir: + return + + subdir = os.path.join(self.mountpoint, sr_uuid) + if util.ioretry(lambda: util.pathexists(subdir)): + if util.ioretry(lambda: util.isdir(subdir)): + raise xs_errors.XenError('SRExists') + else: + try: + util.ioretry(lambda: util.makedirs(subdir)) + except util.CommandException as e: + if e.code != errno.EEXIST: + raise MooseFSException( + 'Failed to create SR subdir: {}'.format(e) + ) + finally: + self.detach(sr_uuid) + + def delete(self, sr_uuid): + # try to remove/delete non VDI contents first + super(MooseFSSR, self).delete(sr_uuid) + try: + if self.checkmount(): + self.detach(sr_uuid) + + if self.subdir: + # Mount using rootpath () instead of /. 
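+ # (i.e. remount the share root so that the per-SR subdirectory + # rootpath/sr_uuid can be removed below)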
+ self.remotepath = self.rootpath + self.attach(sr_uuid) + subdir = os.path.join(self.mountpoint, sr_uuid) + if util.ioretry(lambda: util.pathexists(subdir)): + util.ioretry(lambda: os.rmdir(subdir)) + self.detach(sr_uuid) + except util.CommandException, inst: + self.detach(sr_uuid) + if inst.code != errno.ENOENT: + raise SR.SROSError(114, "Failed to remove MooseFS mount point") + + def vdi(self, uuid, loadLocked=False): + return MooseFSFileVDI(self, uuid) + + @staticmethod + def _is_moosefs_available(): + import distutils.spawn + return distutils.spawn.find_executable('mfsmount') + +class MooseFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = MooseFSSR.DRIVER_TYPE + + return super(MooseFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + def generate_config(self, sr_uuid, vdi_uuid): + util.SMlog("MooseFSFileVDI.generate_config") + if not util.pathexists(self.path): + raise xs_errors.XenError('VDIUnavailable') + resp = {'device_config': self.sr.dconf, + 'sr_uuid': sr_uuid, + 'vdi_uuid': vdi_uuid, + 'sr_sm_config': self.sr.sm_config, + 'command': 'vdi_attach_from_config'} + # Return the 'config' encoded within a normal XMLRPC response so that + # we can use the regular response/error parsing code. + config = xmlrpclib.dumps(tuple([resp]), "vdi_attach_from_config") + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + except: + util.logException("MooseFSFileVDI.attach_from_config") + raise xs_errors.XenError('SRUnavailable', + opterr='Unable to attach from config') + + +if __name__ == '__main__': + SRCommand.run(MooseFSSR, DRIVER_INFO) +else: + SR.registerSR(MooseFSSR) diff --git a/drivers/NFSSR.py b/drivers/NFSSR.py index 1fd32b43..e3ab3252 100755 --- a/drivers/NFSSR.py +++ b/drivers/NFSSR.py @@ -83,9 +83,12 @@ def load(self, sr_uuid): self.sm_config = self.srcmd.params.get('sr_sm_config') or {} self.other_config = self.srcmd.params.get('sr_other_config') or {} self.nosubdir = self.sm_config.get('nosubdir') == "true" - if self.dconf.has_key('serverpath'): - self.remotepath = os.path.join(self.dconf['serverpath'], - not self.nosubdir and sr_uuid or "").encode('utf-8') + serverpath = self.dconf.get('serverpath') + if serverpath is not None: + self.remotepath = os.path.join( + serverpath, + not self.nosubdir and sr_uuid or "" + ).encode('utf-8') self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) # Handle optional dconf attributes @@ -100,7 +103,8 @@ def load(self, sr_uuid): def validate_remotepath(self, scan): - if not self.dconf.has_key('serverpath'): + serverpath = self.dconf.get('serverpath') + if serverpath is None: if scan: try: self.scan_exports(self.dconf['server']) diff --git a/drivers/SR.py b/drivers/SR.py index 8cb516d3..be693d6b 100755 --- a/drivers/SR.py +++ b/drivers/SR.py @@ -49,6 +49,15 @@ def __init__(self, errno, reason): self.errno = errno Exception.__init__(self, reason) + +def deviceCheck(op): + def wrapper(self, *args): + if 'device' not in self.dconf: + raise xs_errors.XenError('ConfigDeviceMissing') + return op(self, *args) + return wrapper + + backends = [] def registerSR(SRClass): """Register SR with handler. 
All SR subclasses should call this in diff --git a/drivers/XE_SR_ERRORCODES.xml b/drivers/XE_SR_ERRORCODES.xml index 97236fe0..fa87109a 100755 --- a/drivers/XE_SR_ERRORCODES.xml +++ b/drivers/XE_SR_ERRORCODES.xml @@ -887,5 +887,51 @@ 1200 + + ZFSSRCreate + ZFS SR creation error + 5000 + + + + ZFSSRDelete + ZFS SR deletion error + 5001 + + + + LinstorMaster + Linstor request must come from master + 5002 + + + + LinstorConfigHostsMissing + The request is missing the LINSTOR hosts parameter + 5003 + + + + LinstorConfigGroupNameMissing + The request is missing the LINSTOR group name parameter + 5004 + + + + LinstorConfigRedundancyMissing + The request is missing the LINSTOR redundancy parameter + 5005 + + + + LinstorSRCreate + LINSTOR SR creation error + 5006 + + + LinstorSRDelete + LINSTOR SR delete error + 5007 + diff --git a/drivers/XFSSR.py b/drivers/XFSSR.py new file mode 100644 index 00000000..de35d738 --- /dev/null +++ b/drivers/XFSSR.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# XFSSR: Based on local-file storage repository, mounts xfs partition + +import SR, SRCommand, FileSR, util, lvutil, scsiutil + +import os +import xs_errors +import vhdutil +from lock import Lock +from constants import EXT_PREFIX + +CAPABILITIES = ["SR_PROBE","SR_UPDATE", "SR_SUPPORTS_LOCAL_CACHING", \ + "VDI_CREATE","VDI_DELETE","VDI_ATTACH","VDI_DETACH", \ + "VDI_UPDATE","VDI_CLONE","VDI_SNAPSHOT","VDI_RESIZE","VDI_MIRROR", \ + "VDI_GENERATE_CONFIG", \ + "VDI_RESET_ON_BOOT/2","ATOMIC_PAUSE", "VDI_CONFIG_CBT", + "VDI_ACTIVATE", "VDI_DEACTIVATE"] + +CONFIGURATION = [ [ 'device', 'local device path (required) (e.g. 
/dev/sda3)' ] ] + +DRIVER_INFO = { + 'name': 'Local XFS VHD', + 'description': 'SR plugin which represents disks as VHD files stored on a local XFS filesystem, created inside an LVM volume', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2019 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION + } + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + + +class XFSSR(FileSR.FileSR): + DRIVER_TYPE = 'xfs' + + """XFS Local file storage repository""" + def handles(srtype): + return srtype == XFSSR.DRIVER_TYPE + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_xfs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='xfsprogs is not installed' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + if not self.dconf.has_key('device') or not self.dconf['device']: + raise xs_errors.XenError('ConfigDeviceMissing') + + self.root = self.dconf['device'] + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self.vgname = EXT_PREFIX + sr_uuid + self.remotepath = os.path.join("/dev",self.vgname,sr_uuid) + self.attached = self._checkmount() + self.driver_config = DRIVER_CONFIG + + def delete(self, sr_uuid): + super(XFSSR, self).delete(sr_uuid) + + # Check PVs match VG + try: + for dev in self.root.split(','): + cmd = ["pvs", dev] + txt = util.pread2(cmd) + if txt.find(self.vgname) == -1: + raise xs_errors.XenError('VolNotFound', \ + opterr='volume is %s' % self.vgname) + except util.CommandException, inst: + raise xs_errors.XenError('PVSfailed', \ + opterr='error is %d' % inst.code) + + # Remove LV, VG and pv + try: + cmd = ["lvremove", "-f", self.remotepath] + util.pread2(cmd) + + cmd = ["vgremove", self.vgname] + util.pread2(cmd) + + for dev in self.root.split(','): + cmd = ["pvremove", dev] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMDelete', \ + opterr='errno is %d' % inst.code) + + def attach(self, sr_uuid): + if not self._checkmount(): + try: + #Activate LV + cmd = ['lvchange','-ay',self.remotepath] + util.pread2(cmd) + + # make a mountpoint: + if not os.path.isdir(self.path): + os.makedirs(self.path) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Unable to activate LV. Errno is %d' % inst.code) + + try: + util.pread(["fsck", "-a", self.remotepath]) + except util.CommandException, inst: + if inst.code == 1: + util.SMlog("FSCK detected and corrected FS errors. Not fatal.") + else: + raise xs_errors.XenError('LVMMount', \ + opterr='FSCK failed on %s. Errno is %d' % (self.remotepath,inst.code)) + + try: + util.pread(["mount", self.remotepath, self.path]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Failed to mount FS. 
Errno is %d' % inst.code) + + self.attached = True + + #Update SCSIid string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + # Set the block scheduler + for dev in self.root.split(','): self.block_setscheduler(dev) + + def detach(self, sr_uuid): + super(XFSSR, self).detach(sr_uuid) + try: + # deactivate SR + cmd = ["lvchange", "-an", self.remotepath] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMUnMount', \ + opterr='lvm -an failed errno is %d' % inst.code) + + def probe(self): + return lvutil.srlist_toxml(lvutil.scan_srlist(EXT_PREFIX, self.root), + EXT_PREFIX) + + def create(self, sr_uuid, size): + if self._checkmount(): + raise xs_errors.XenError('SRExists') + + # Check none of the devices already in use by other PBDs + if util.test_hostPBD_devs(self.session, sr_uuid, self.root): + raise xs_errors.XenError('SRInUse') + + # Check serial number entry in SR records + for dev in self.root.split(','): + if util.test_scsiserial(self.session, dev): + raise xs_errors.XenError('SRInUse') + + if not lvutil._checkVG(self.vgname): + lvutil.createVG(self.root, self.vgname) + + if lvutil._checkLV(self.remotepath): + raise xs_errors.XenError('SRExists') + + try: + numdevs = len(self.root.split(',')) + cmd = ["lvcreate", "-n", sr_uuid] + if numdevs > 1: + lowest = -1 + for dev in self.root.split(','): + stats = lvutil._getPVstats(dev) + if lowest < 0 or stats['freespace'] < lowest: + lowest = stats['freespace'] + size_mb = (lowest / (1024 * 1024)) * numdevs + + # Add stripe parameter to command + cmd += ["-i", str(numdevs), "-I", "2048"] + else: + stats = lvutil._getVGstats(self.vgname) + size_mb = stats['freespace'] / (1024 * 1024) + assert(size_mb > 0) + cmd += ["-L", str(size_mb), self.vgname] + text = util.pread(cmd) + + cmd = ["lvchange", "-ay", self.remotepath] + text = util.pread(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMCreate', \ + opterr='lv operation, error %d' % inst.code) + except AssertionError: + raise xs_errors.XenError('SRNoSpace', \ + opterr='Insufficient space in VG %s' % self.vgname) + + try: + util.pread2(["mkfs.xfs", self.remotepath]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMFilesystem', \ + opterr='mkfs failed error %d' % inst.code) + + #Update serial number string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + def vdi(self, uuid, loadLocked=False): + return XFSFileVDI(self, uuid) + + @staticmethod + def _is_xfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('mkfs.xfs') + + +class XFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self,'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = XFSSR.DRIVER_TYPE + + return super(XFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + +if __name__ == '__main__': + SRCommand.run(XFSSR, DRIVER_INFO) +else: + SR.registerSR(XFSSR) diff --git a/drivers/ZFSSR.py b/drivers/ZFSSR.py new file mode 100644 index 00000000..b8032117 --- /dev/null +++ b/drivers/ZFSSR.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import SR +import SRCommand + +import FileSR + +import util +import xs_errors + +CAPABILITIES = [ + 'SR_UPDATE', + 'VDI_CREATE', + 'VDI_DELETE', + 'VDI_ATTACH', + 'VDI_DETACH', + 'VDI_CLONE', + 'VDI_SNAPSHOT', + 'VDI_RESIZE', + 'VDI_MIRROR', + 'VDI_GENERATE_CONFIG', + 'ATOMIC_PAUSE', + 'VDI_CONFIG_CBT', + 'VDI_ACTIVATE', + 'VDI_DEACTIVATE', + 'THIN_PROVISIONING' +] + +CONFIGURATION = [ + ['location', 'local ZFS directory path (required)'] +] + +DRIVER_INFO = { + 'name': 'Local ZFS VHD', + 'description': + 'SR plugin which represents disks as VHD files stored on a ZFS disk', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2020 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + + +def is_zfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('zfs') and \ + util.pathexists('/sys/module/zfs/initstate') + + +def is_zfs_path(path): + cmd = ['findmnt', '-o', 'FSTYPE', '-n', path] + fs_type = util.pread2(cmd).split('\n')[0] + return fs_type == 'zfs' + + +class ZFSSR(FileSR.FileSR): + DRIVER_TYPE = 'zfs' + + @staticmethod + def handles(type): + return type == ZFSSR.DRIVER_TYPE + + def load(self, sr_uuid): + if not is_zfs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='zfs is not installed or module is not loaded' + ) + return super(ZFSSR, self).load(sr_uuid) + + def create(self, sr_uuid, size): + if not is_zfs_path(self.remotepath): + raise xs_errors.XenError( + 'ZFSSRCreate', + opterr='Cannot create SR, path is not a ZFS mountpoint' + ) + return super(ZFSSR, self).create(sr_uuid, size) + + def delete(self, sr_uuid): + if not self._checkmount(): + raise xs_errors.XenError( + 'ZFSSRDelete', + opterr='ZFS SR is not mounted or uses an invalid FS type' + ) + return super(ZFSSR, self).delete(sr_uuid) + + def attach(self, sr_uuid): + if not is_zfs_path(self.remotepath): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='Invalid ZFS path' + ) + return super(ZFSSR, self).attach(sr_uuid) + + def detach(self, sr_uuid): + return super(ZFSSR, self).detach(sr_uuid) + + def vdi(self, uuid, loadLocked=False): + return ZFSFileVDI(self, uuid) + + # Ensure _checkmount is overridden to prevent bad behaviors in FileSR. 
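+ # A mounted path that is not ZFS must not be reported as a valid SR, + # so the filesystem type is verified in addition to the parent check.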
+ def _checkmount(self): + return super(ZFSSR, self)._checkmount() and \ + is_zfs_path(self.remotepath) + + +class ZFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = ZFSSR.DRIVER_TYPE + + return super(ZFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + +if __name__ == '__main__': + SRCommand.run(ZFSSR, DRIVER_INFO) +else: + SR.registerSR(ZFSSR) diff --git a/drivers/blktap2.py b/drivers/blktap2.py index e1f75e9f..ea19cca1 100755 --- a/drivers/blktap2.py +++ b/drivers/blktap2.py @@ -50,6 +50,12 @@ from socket import socket, AF_UNIX, SOCK_STREAM from httplib import HTTP, HTTPConnection +try: + from linstorvolumemanager import log_drbd_openers + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + PLUGIN_TAP_PAUSE = "tapdisk-pause" SOCKPATH = "/var/xapi/xcp-rrdd" @@ -817,7 +823,22 @@ def launch_on_tap(cls, blktap, path, _type, options): TapCtl.attach(pid, minor) try: - TapCtl.open(pid, minor, _type, path, options) + retry_open = 0 + while True: + try: + TapCtl.open(pid, minor, _type, path, options) + except TapCtl.CommandFailure as e: + err = ( + 'status' in e.info and e.info['status'] + ) or None + if err in (errno.EIO, errno.EROFS, errno.EAGAIN): + if retry_open < 5: + retry_open += 1 + time.sleep(1) + continue + if LINSTOR_AVAILABLE and err == errno.EROFS: + log_drbd_openers(path) + break try: tapdisk = cls.__from_blktap(blktap) node = '/sys/dev/block/%d:%d' % (tapdisk.major(), tapdisk.minor) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 97c332c8..346180f5 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -47,7 +47,19 @@ from refcounter import RefCounter from ipc import IPCFlag from lvmanager import LVActivator -from srmetadata import LVMMetadataHandler +from srmetadata import LVMMetadataHandler, VDI_TYPE_TAG + +try: + from linstorjournaler import LinstorJournaler + from linstorvhdutil import LinstorVhdUtil + from linstorvolumemanager import get_controller_uri + from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import LinstorVolumeManagerError + + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + # Disable automatic leaf-coalescing. Online leaf-coalesce is currently not # possible due to lvhd_stop_using_() not working correctly. However, we leave @@ -439,7 +451,7 @@ def srUpdate(self): # # VDI # -class VDI: +class VDI(object): """Object representing a VDI of a VHD-based SR""" POLL_INTERVAL = 1 @@ -643,7 +655,19 @@ def getAllPrunable(self): if child not in childList: thisPrunable = False - if not self.scanError and thisPrunable: + # We can destroy the current VDI if all childs are hidden BUT the + # current VDI must be hidden too to do that! + # Example in this case (after a failed live leaf coalesce): + # + # SMGC: [32436] SR 07ed ('linstor-nvme-sr') (2 VDIs in 1 VHD trees): + # SMGC: [32436] b5458d61(1.000G/4.127M) + # SMGC: [32436] *OLD_b545(1.000G/4.129M) + # + # OLD_b545 is hidden and must be removed, but b5458d61 not. + # Normally we are not in this function when the delete action is + # executed but in `_liveLeafCoalesce`. 
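+ # Hence the additional `not self.hidden` check in the condition below.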
+ + if not self.scanError and not self.hidden and thisPrunable: vdiList.append(self) return vdiList @@ -702,6 +726,12 @@ def delete(self): lock.Lock.cleanupAll(self.uuid) self._clear() + def getParent(self): + return vhdutil.getParent(self.path, lambda x: x.strip()) + + def repair(self, parent): + vhdutil.repair(parent) + def __str__(self): strHidden = "" if self.hidden: @@ -816,12 +846,14 @@ def _reportCoalesceError(vdi, ce): xapi.message.create(msg_name, "3", "SR", vdi.sr.uuid, msg_body) _reportCoalesceError = staticmethod(_reportCoalesceError) + def coalesce(self): + vhdutil.coalesce(self.path) + def _doCoalesceVHD(vdi): try: - startTime = time.time() vhdSize = vdi.getSizeVHD() - vhdutil.coalesce(vdi.path) + vdi.coalesce() endTime = time.time() vdi.sr.recordStorageSpeed(startTime, endTime, vhdSize) except util.CommandException, ce: @@ -850,11 +882,11 @@ def _coalesceVHD(self, timeOut): # Try a repair and reraise the exception parent = "" try: - parent = vhdutil.getParent(self.path, lambda x: x.strip()) + parent = self.getParent() # Repair error is logged and ignored. Error reraised later util.SMlog('Coalesce failed on %s, attempting repair on ' \ 'parent %s' % (self.uuid, parent)) - vhdutil.repair(parent) + self.repair(parent) except Exception, e: util.SMlog('(error ignored) Failed to repair parent %s ' \ 'after failed coalesce on %s, err: %s' % @@ -1347,12 +1379,136 @@ def _calcExtraSpaceForSnapshotCoalescing(self): lvhdutil.calcSizeLV(self.getSizeVHD()) +class LinstorVDI(VDI): + """Object representing a VDI in a LINSTOR SR""" + + MAX_SIZE = 2 * 1024 * 1024 * 1024 * 1024 # Max VHD size. + + VOLUME_LOCK_TIMEOUT = 30 + + def load(self, info=None): + self.parentUuid = info.parentUuid + self.scanError = True + self.parent = None + self.children = [] + + self.fileName = self.sr._linstor.get_volume_name(self.uuid) + self.path = self.sr._linstor.build_device_path(self.fileName) + + if not info: + try: + info = self.sr._vhdutil.get_vhd_info(self.uuid) + except util.SMException: + Util.log( + ' [VDI {}: failed to read VHD metadata]'.format(self.uuid) + ) + return + + self.parentUuid = info.parentUuid + self.sizeVirt = info.sizeVirt + self._sizeVHD = info.sizePhys + self.hidden = info.hidden + self.scanError = False + + def rename(self, uuid): + Util.log('Renaming {} -> {} (path={})'.format( + self.uuid, uuid, self.path + )) + self.sr._linstor.update_volume_uuid(self.uuid, uuid) + VDI.rename(self, uuid) + + def delete(self): + if len(self.children) > 0: + raise util.SMException( + 'VDI {} has children, can\'t delete'.format(self.uuid) + ) + self.sr.lock() + try: + self.sr._linstor.destroy_volume(self.uuid) + self.sr.forgetVDI(self.uuid) + finally: + self.sr.unlock() + VDI.delete(self) + + def validate(self, fast=False): + if not self.sr._vhdutil.check(self.uuid, fast=fast): + raise util.SMException('VHD {} corrupted'.format(self)) + + def pause(self, failfast=False): + self.sr._linstor.ensure_volume_is_not_locked( + self.uuid, timeout=self.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorVDI, self).pause(failfast) + + def coalesce(self): + self.sr._vhdutil.force_coalesce(self.path) + + def getParent(self): + return self.sr._vhdutil.get_parent( + self.sr._linstor.get_volume_uuid_from_device_path(self.path) + ) + + def repair(self, parent_uuid): + self.sr._vhdutil.force_repair( + self.sr._linstor.get_device_path(parent_uuid) + ) + + def _relinkSkip(self): + abortFlag = IPCFlag(self.sr.uuid) + for child in self.children: + if abortFlag.test(FLAG_TYPE_ABORT): + raise AbortException('Aborting due to 
signal') + Util.log( + ' Relinking {} from {} to {}'.format( + child, self, self.parent + ) + ) + + session = child.sr.xapi.session + sr_uuid = child.sr.uuid + vdi_uuid = child.uuid + try: + self.sr._linstor.ensure_volume_is_not_locked( + vdi_uuid, timeout=self.VOLUME_LOCK_TIMEOUT + ) + blktap2.VDI.tap_pause(session, sr_uuid, vdi_uuid) + child._setParent(self.parent) + finally: + blktap2.VDI.tap_unpause(session, sr_uuid, vdi_uuid) + self.children = [] + + def _setParent(self, parent): + self.sr._vhdutil.force_parent(self.path, parent.path) + self.parent = parent + self.parentUuid = parent.uuid + parent.children.append(self) + try: + self.setConfig(self.DB_VHD_PARENT, self.parentUuid) + Util.log("Updated the vhd-parent field for child %s with %s" % \ + (self.uuid, self.parentUuid)) + except: + Util.log("Failed to update %s with vhd-parent field %s" % \ + (self.uuid, self.parentUuid)) + + def _setHidden(self, hidden=True): + HIDDEN_TAG = 'hidden' + + if self.raw: + self.sr._linstor.update_volume_metadata(self.uuid, { + HIDDEN_TAG: hidden + }) + self.hidden = hidden + else: + VDI._setHidden(self, hidden) + + def _queryVHDBlocks(self): + return self.sr._vhdutil.get_block_bitmap(self.uuid) ################################################################################ # # SR # -class SR: +class SR(object): class LogFilter: def __init__(self, sr): self.sr = sr @@ -1403,7 +1559,8 @@ def _getTreeStr(self, vdi, indent = 8): TYPE_FILE = "file" TYPE_LVHD = "lvhd" - TYPES = [TYPE_LVHD, TYPE_FILE] + TYPE_LINSTOR = "linstor" + TYPES = [TYPE_LVHD, TYPE_FILE, TYPE_LINSTOR] LOCK_RETRY_INTERVAL = 3 LOCK_RETRY_ATTEMPTS = 20 @@ -1424,6 +1581,8 @@ def getInstance(uuid, xapiSession, createLock = True, force = False): return FileSR(uuid, xapi, createLock, force) elif type == SR.TYPE_LVHD: return LVHDSR(uuid, xapi, createLock, force) + elif type == SR.TYPE_LINSTOR: + return LinstorSR(uuid, xapi, createLock, force) raise util.SMException("SR type %s not recognized" % type) getInstance = staticmethod(getInstance) @@ -2730,6 +2889,246 @@ def _updateSlavesOnResize(self, vdi): vdi.fileName, vdi.uuid, slaves) +class LinstorSR(SR): + TYPE = SR.TYPE_LINSTOR + + def __init__(self, uuid, xapi, createLock, force): + if not LINSTOR_AVAILABLE: + raise util.SMException( + 'Can\'t load cleanup LinstorSR: LINSTOR libraries are missing' + ) + + SR.__init__(self, uuid, xapi, createLock, force) + self.path = LinstorVolumeManager.DEV_ROOT_PATH + self._reloadLinstor() + + def deleteVDI(self, vdi): + self._checkSlaves(vdi) + SR.deleteVDI(self, vdi) + + def getFreeSpace(self): + return self._linstor.max_volume_size_allowed + + def scan(self, force=False): + all_vdi_info = self._scan(force) + for uuid, vdiInfo in all_vdi_info.iteritems(): + # When vdiInfo is None, the VDI is RAW. 
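+ # (hence `not vdiInfo` below, which is passed as the `raw` flag of the + # LinstorVDI constructor)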
+ vdi = self.getVDI(uuid) + if not vdi: + self.logFilter.logNewVDI(uuid) + vdi = LinstorVDI(self, uuid, not vdiInfo) + self.vdis[uuid] = vdi + if vdiInfo: + vdi.load(vdiInfo) + self._removeStaleVDIs(all_vdi_info.keys()) + self._buildTree(force) + self.logFilter.logState() + self._handleInterruptedCoalesceLeaf() + + def pauseVDIs(self, vdiList): + self._linstor.ensure_volume_list_is_not_locked( + vdiList, timeout=LinstorVDI.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorSR, self).pauseVDIs(vdiList) + + def _reloadLinstor(self): + session = self.xapi.session + host_ref = util.get_this_host_ref(session) + sr_ref = session.xenapi.SR.get_by_uuid(self.uuid) + + pbd = util.find_my_pbd(session, host_ref, sr_ref) + if pbd is None: + raise util.SMException('Failed to find PBD') + + dconf = session.xenapi.PBD.get_device_config(pbd) + group_name = dconf['group-name'] + + controller_uri = get_controller_uri() + self.journaler = LinstorJournaler( + controller_uri, group_name, logger=util.SMlog + ) + + self._linstor = LinstorVolumeManager( + controller_uri, + group_name, + repair=True, + logger=util.SMlog + ) + self._vhdutil = LinstorVhdUtil(session, self._linstor) + + def _scan(self, force): + for i in range(SR.SCAN_RETRY_ATTEMPTS): + self._reloadLinstor() + error = False + try: + all_vdi_info = self._load_vdi_info() + for uuid, vdiInfo in all_vdi_info.iteritems(): + if vdiInfo and vdiInfo.error: + error = True + break + if not error: + return all_vdi_info + Util.log('Scan error, retrying ({})'.format(i)) + except Exception as e: + Util.log('Scan exception, retrying ({}): {}'.format(i, e)) + Util.log(traceback.format_exc()) + + if force: + return all_vdi_info + raise util.SMException('Scan error') + + def _load_vdi_info(self): + all_vdi_info = {} + + # TODO: Ensure metadata contains the right info. + + all_volume_info = self._linstor.get_volumes_with_info() + volumes_metadata = self._linstor.get_volumes_with_metadata() + for vdi_uuid, volume_info in all_volume_info.items(): + try: + if not volume_info.name and \ + not list(volumes_metadata[vdi_uuid].items()): + continue # Ignore it, probably deleted. + + vdi_type = volumes_metadata[vdi_uuid].get(VDI_TYPE_TAG) + if vdi_type == vhdutil.VDI_TYPE_VHD: + info = self._vhdutil.get_vhd_info(vdi_uuid) + elif not vdi_uuid.startswith('DELETED_'): + # Ensure it's not a VHD... + try: + info = self._vhdutil.get_vhd_info(vdi_uuid) + except: + try: + self.repair(vdi_uuid) + info = self._vhdutil.get_vhd_info(vdi_uuid) + except: + info = None + else: + # Assume it's really a RAW volume of a failed snap without VHD header/footer. + info = None + except Exception as e: + Util.log( + ' [VDI {}: failed to load VDI info]: {}' + .format(vdi_uuid, e) + ) + info = vhdutil.VHDInfo(vdi_uuid) + info.error = 1 + all_vdi_info[vdi_uuid] = info + return all_vdi_info + + # TODO: Maybe implement _liveLeafCoalesce/_prepareCoalesceLeaf/ + # _finishCoalesceLeaf/_updateSlavesOnResize like LVM plugin. + + def _calcExtraSpaceNeeded(self, child, parent): + meta_overhead = vhdutil.calcOverheadEmpty(LinstorVDI.MAX_SIZE) + bitmap_overhead = vhdutil.calcOverheadBitmap(parent.sizeVirt) + virtual_size = LinstorVolumeManager.round_up_volume_size( + parent.sizeVirt + meta_overhead + bitmap_overhead + ) + volume_size = self._linstor.get_volume_size(parent.uuid) + return virtual_size - volume_size + + def _hasValidDevicePath(self, uuid): + try: + self._linstor.get_device_path(uuid) + except Exception: + # TODO: Maybe log exception. 
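+ # Any failure of get_device_path is treated as 'no usable device path'.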
+ return False + return True + + def _liveLeafCoalesce(self, vdi): + self.lock() + try: + self._linstor.ensure_volume_is_not_locked( + vdi.uuid, timeout=LinstorVDI.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorSR, self)._liveLeafCoalesce(vdi) + finally: + self.unlock() + + def _prepareCoalesceLeaf(self, vdi): + # Move diskless path if necessary. We must have an access + # to modify locally the volume. + self._linstor.get_device_path(vdi.uuid) + + def _handleInterruptedCoalesceLeaf(self): + entries = self.journaler.get_all(VDI.JRN_LEAF) + for uuid, parentUuid in entries.iteritems(): + if self._hasValidDevicePath(parentUuid) or \ + self._hasValidDevicePath(self.TMP_RENAME_PREFIX + uuid): + self._undoInterruptedCoalesceLeaf(uuid, parentUuid) + else: + self._finishInterruptedCoalesceLeaf(uuid, parentUuid) + self.journaler.remove(VDI.JRN_LEAF, uuid) + vdi = self.getVDI(uuid) + if vdi: + vdi.ensureUnpaused() + + def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): + Util.log('*** UNDO LEAF-COALESCE') + parent = self.getVDI(parentUuid) + if not parent: + parent = self.getVDI(childUuid) + if not parent: + raise util.SMException( + 'Neither {} nor {} found'.format(parentUuid, childUuid) + ) + Util.log( + 'Renaming parent back: {} -> {}'.format(childUuid, parentUuid) + ) + parent.rename(parentUuid) + + child = self.getVDI(childUuid) + if not child: + child = self.getVDI(self.TMP_RENAME_PREFIX + childUuid) + if not child: + raise util.SMException( + 'Neither {} nor {} found'.format( + childUuid, self.TMP_RENAME_PREFIX + childUuid + ) + ) + Util.log('Renaming child back to {}'.format(childUuid)) + child.rename(childUuid) + Util.log('Updating the VDI record') + child.setConfig(VDI.DB_VHD_PARENT, parentUuid) + child.setConfig(VDI.DB_VDI_TYPE, vhdutil.VDI_TYPE_VHD) + + # TODO: Maybe deflate here. + + if child.hidden: + child._setHidden(False) + if not parent.hidden: + parent._setHidden(True) + self._updateSlavesOnUndoLeafCoalesce(parent, child) + Util.log('*** leaf-coalesce undo successful') + + def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): + Util.log('*** FINISH LEAF-COALESCE') + vdi = self.getVDI(childUuid) + if not vdi: + raise util.SMException('VDI {} not found'.format(childUuid)) + # TODO: Maybe inflate. 
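+ # The parent record may already have been removed, hence the ignored + # XenAPI.Failure below.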
+ try: + self.forgetVDI(parentUuid) + except XenAPI.Failure: + pass + self._updateSlavesOnResize(vdi) + Util.log('*** finished leaf-coalesce successfully') + + def _checkSlaves(self, vdi): + try: + all_openers = self._linstor.get_volume_openers(vdi.uuid) + for openers in all_openers.itervalues(): + for opener in openers.values(): + if opener['process-name'] != 'tapdisk': + raise util.SMException( + 'VDI {} is in use: {}'.format(vdi.uuid, all_openers) + ) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: + raise + + ################################################################################ # # Helpers @@ -2765,9 +3164,14 @@ def normalizeType(type): if type in ["lvm", "lvmoiscsi", "lvmohba", "lvmofcoe"]: # temporary while LVHD is symlinked as LVM type = SR.TYPE_LVHD - if type in ["ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb"]: + if type in [ + "ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb", "cephfs", "glusterfs", + "moosefs", "xfs", "zfs", "ext4" + ]: type = SR.TYPE_FILE - if not type in SR.TYPES: + if type in ["linstor"]: + type = SR.TYPE_LINSTOR + if type not in SR.TYPES: raise util.SMException("Unsupported SR type: %s" % type) return type diff --git a/drivers/linstor-manager b/drivers/linstor-manager new file mode 100755 index 00000000..45201eed --- /dev/null +++ b/drivers/linstor-manager @@ -0,0 +1,989 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# We must modify default import path, we don't want to import modules +# installed in plugins folder and instead we must import from LINSTOR driver +# folder. 
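+# (sys.path[0] is the directory containing this script, i.e. the xapi + # plugins directory, so it is overwritten rather than appended to)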
+import sys +sys.path[0] = '/opt/xensource/sm/' + +import base64 +import distutils.util +import os +import socket +import XenAPI +import XenAPIPlugin + +from linstorjournaler import LinstorJournaler +from linstorvolumemanager import get_controller_uri, get_local_volume_openers, LinstorVolumeManager +from lock import Lock +import json +import LinstorSR +import re +import util +import vhdutil + +BACKING_DISK_RE = re.compile('^/dev/([^/]+)/(?:[^/]+)$') +LVM_PLUGIN = 'lvm.py' +THIN_POOL = 'thin_pool' + +FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' +LINSTOR_PORTS = [3366, 3370, 3376, 3377, 8076, 8077] +DRBD_PORTS = '7000:8000' + +DRBD_REACTOR_CONF = '/etc/drbd-reactor.d/sm-linstor.toml' + +DRBD_REACTOR_CONF_CONTENT = """[[promoter]] + +[promoter.resources.xcp-persistent-database] +start = [ "var-lib-linstor.service", "linstor-controller.service" ] +""" + +DRBD_REACTOR_DEPS = [ + '/run/systemd/system/linstor-controller.service.d/reactor.conf', + '/run/systemd/system/var-lib-linstor.service.d/reactor.conf' +] + + +def update_linstor_port(port, open_ports): + fn = 'open' if open_ports else 'close' + args = ( + FIREWALL_PORT_SCRIPT, fn, str(port), 'tcp' + ) + + (ret, out, err) = util.doexec(args) + if ret == 0: + return + raise Exception('Failed to {} port: {} {}'.format(fn, out, err)) + + +def has_iptables_rule(rule): + (ret, stdout, stderr) = util.doexec(['iptables', '-C'] + rule) + return not ret + + +def update_drbd_ports(open_ports): + # We want to use a static rule regarding DRBD volumes, + # so we can't use the XAPI firewall port script, we have to manually + # check for existing rules before updating iptables service. + rule = ['INPUT', '-p', 'tcp', '--dport', DRBD_PORTS, '-j', 'ACCEPT'] + if open_ports == has_iptables_rule(rule): + return + if open_ports: + rule.insert(1, '1') + (ret, stdout, stderr) = util.doexec(['iptables', '-I'] + rule) + if ret: + raise Exception('Failed to add DRBD rule: {}'.format(stderr)) + else: + (ret, stdout, stderr) = util.doexec(['iptables', '-D'] + rule) + if ret: + raise Exception('Failed to remove DRBD rule: {}'.format(stderr)) + (ret, stdout, stderr) = util.doexec(['service', 'iptables', 'save']) + if ret: + raise Exception('Failed to save DRBD rule: {}'.format(stderr)) + + +def update_all_ports(open_ports): + for port in LINSTOR_PORTS: + update_linstor_port(port, open_ports) + update_drbd_ports(open_ports) + + +def update_linstor_satellite_service(start): + service = 'linstor-satellite' + + # Stop services in all cases first. + # Ensure we don't have an invalid cache used by a satellite. + # (We found an issue with a new added disk which used a volume group name + # formerly involved by another disk. To avoid this kind of problem, we + # always restart the satellite.) 
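+ # The satellite is therefore stopped unconditionally first, and only + # enabled/started again below when `start` is True.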
+ util.enable_and_start_service(service, False) + if start: + util.enable_and_start_service(service, True) + + +def update_drbd_reactor_service(start): + if start: + util.atomicFileWrite(DRBD_REACTOR_CONF, None, DRBD_REACTOR_CONF_CONTENT) + else: + try: + os.remove(DRBD_REACTOR_CONF) + except Exception: + pass + + util.stop_service('drbd-reactor') + + try: + util.stop_service('drbd-promote@xcp\x2dpersistent\x2ddatabase.service') + except Exception as e: + if not str(e).rstrip().endswith(' not loaded.'): + raise e + + util.stop_service('linstor-controller') + util.stop_service('var-lib-linstor.service') + + for dep in DRBD_REACTOR_DEPS: + try: + os.remove(dep) + except Exception: + pass + + util.doexec(['systemctl', 'daemon-reload']) + util.enable_and_start_service('drbd-reactor', start) + + +def exec_create_sr(session, name, description, disks, volume_group, redundancy, provisioning, force): + disk_hostnames = disks.keys() + thin = provisioning == 'thin' + + # Create volumes. + hosts = session.xenapi.host.get_all_records() + hostnames = [] + for host_ref, host_record in hosts.items(): + hostname = host_record['hostname'] + hostnames.append(hostname) + + if force: + try: + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'destroy_volume_group', { + 'vg_name': volume_group, + 'force': 'True' + } + ) + except Exception as e: + try: + response = session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'list_volume_groups', { + 'vg_name': volume_group + } + ) + if response != '{}': + raise e + except Exception: + raise e + + if hostname not in disk_hostnames or not disks[hostname]: + if force or session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'list_volume_groups', { + 'vg_name': volume_group + } + ) == '{}': + continue + raise Exception('Volume group should not exist on `{}`, you must remove it manually'.format(hostname)) + + host_disks = disks[hostname] + if type(host_disks) is list: + host_disks = ','.join(disks[hostname]) + else: + raise Exception('Disk value of `{}` must be a disk list'.format(hostname)) + + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_physical_volume', { + 'devices': host_disks, + 'force': str(force) + } + ) + + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_volume_group', { + 'vg_name': volume_group, + 'devices': host_disks + } + ) + + if thin: + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_thin_pool', { + 'vg_name': volume_group, + 'lv_name': THIN_POOL + } + ) + + # Create SR.
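+ # XenAPI signature: SR.create(host, device_config, physical_size, + # name_label, name_description, type, content_type, shared, sm_config), + # hence the '0' size, '' content-type and shared=True below.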
+ master_ref = session.xenapi.pool.get_all_records().values()[0]['master'] + + device_config = { + 'redundancy': str(redundancy), + 'provisioning': 'thin' if thin else 'thick', + 'group-name': '{}/{}'.format(volume_group, THIN_POOL) if thin else volume_group, + 'hosts': ','.join(hostnames), + 'monitor-db-quorum': str(len(hostnames) > 2) + } + sr_ref = session.xenapi.SR.create( + master_ref, device_config, '0', name, description, 'linstor', '', True, {} + ) + return session.xenapi.SR.get_uuid(sr_ref) + + +def get_drbd_volumes(volume_group=None): + drbd_volumes = {} + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'show', '--json']) + if ret: + raise Exception('Failed to get JSON object: {}'.format(stderr)) + + config = json.loads(stdout) + for resource in config: + for volume in resource['_this_host']['volumes']: + backing_disk = volume['backing-disk'] + match = BACKING_DISK_RE.match(backing_disk) + if not match: + continue + + cur_volume_group = match.groups()[0] + if volume_group and cur_volume_group != volume_group: + continue + + minor = int(volume['device_minor']) + if cur_volume_group in drbd_volumes: + drbd_volumes[cur_volume_group].append(minor) + else: + drbd_volumes[cur_volume_group] = [minor] + return drbd_volumes + + +def force_destroy_drbd_volume(minor): + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'detach', minor, '--force']) + if ret: + raise Exception('Failed to detach volume: {}'.format(stderr)) + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'del-minor', minor]) + if ret: + raise Exception('Failed to destroy volume: {}'.format(stderr)) + +# ------------------------------------------------------------------------------ + + +def prepare_sr(session, args): + try: + LinstorSR.activate_lvm_group(args['groupName']) + + update_all_ports(open_ports=True) + # We don't want to enable and start drbd-reactor daemon during + # SR creation. 
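+ # It is enabled per host later by add_host (which calls + # update_drbd_reactor_service(start=True)).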
+ update_drbd_reactor_service(start=False) + update_linstor_satellite_service(start=True) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:prepare_sr error: {}'.format(e)) + return str(False) + + +def release_sr(session, args): + try: + update_linstor_satellite_service(start=False) + update_drbd_reactor_service(start=False) + update_all_ports(open_ports=False) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:release_sr error: {}'.format(e)) + return str(False) + + +def update_drbd_reactor(session, args): + try: + enabled = distutils.util.strtobool(args['enabled']) + update_drbd_reactor_service(start=enabled) + return str(True) + except Exception as e: + util.SMlog( + 'linstor-manager:update_drbd_reactor error: {}'.format(e) + ) + return str(False) + + +def attach(session, args): + try: + sr_uuid = args['srUuid'] + vdi_uuid = args['vdiUuid'] + group_name = args['groupName'] + + controller_uri = get_controller_uri() + journaler = LinstorJournaler( + controller_uri, group_name, logger=util.SMlog + ) + linstor = LinstorVolumeManager( + controller_uri, + group_name, + logger=util.SMlog + ) + LinstorSR.attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:attach error: {}'.format(e)) + return str(False) + + +def detach(session, args): + try: + sr_uuid = args['srUuid'] + vdi_uuid = args['vdiUuid'] + group_name = args['groupName'] + + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + LinstorSR.detach_thin(session, linstor, sr_uuid, vdi_uuid) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:detach error: {}'.format(e)) + return str(False) + + +def destroy(session, args): + try: + group_name = args['groupName'] + + # When destroy is called, there are no running drbd-reactor daemons. + # So the controllers are stopped too, we must start an instance. + util.restart_service('var-lib-linstor.service') + util.restart_service('linstor-controller') + + linstor = LinstorVolumeManager( + 'linstor://localhost', + group_name, + logger=util.SMlog + ) + linstor.destroy() + return str(True) + except Exception as e: + util.stop_service('linstor-controller') + util.stop_service('var-lib-linstor.service') + util.SMlog('linstor-manager:destroy error: {}'.format(e)) + return str(False) + + +def check(session, args): + try: + device_path = args['devicePath'] + ignore_missing_footer = distutils.util.strtobool( + args['ignoreMissingFooter'] + ) + fast = distutils.util.strtobool(args['fast']) + return str(vhdutil.check(device_path, ignore_missing_footer, fast)) + except Exception as e: + util.SMlog('linstor-manager:check error: {}'.format(e)) + raise + + +def get_vhd_info(session, args): + try: + device_path = args['devicePath'] + group_name = args['groupName'] + include_parent = distutils.util.strtobool(args['includeParent']) + + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + def extract_uuid(device_path): + # TODO: Remove new line in the vhdutil module. Not here. 
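+ # (the path handed over by vhdutil may end with a newline, hence + # the rstrip below)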
+ return linstor.get_volume_uuid_from_device_path( + device_path.rstrip('\n') + ) + + vhd_info = vhdutil.getVHDInfo( + device_path, extract_uuid, include_parent, False + ) + return json.dumps(vhd_info.__dict__) + except Exception as e: + util.SMlog('linstor-manager:get_vhd_info error: {}'.format(e)) + raise + + +def has_parent(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.hasParent(device_path)) + except Exception as e: + util.SMlog('linstor-manager:has_parent error: {}'.format(e)) + raise + + +def get_parent(session, args): + try: + device_path = args['devicePath'] + group_name = args['groupName'] + + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + def extract_uuid(device_path): + # TODO: Remove new line in the vhdutil module. Not here. + return linstor.get_volume_uuid_from_device_path( + device_path.rstrip('\n') + ) + + return vhdutil.getParent(device_path, extract_uuid) + except Exception as e: + util.SMlog('linstor-manager:get_parent error: {}'.format(e)) + raise + + +def get_size_virt(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.getSizeVirt(device_path)) + except Exception as e: + util.SMlog('linstor-manager:get_size_virt error: {}'.format(e)) + raise + + +def get_size_phys(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.getSizePhys(device_path)) + except Exception as e: + util.SMlog('linstor-manager:get_size_phys error: {}'.format(e)) + raise + + +def get_depth(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.getDepth(device_path)) + except Exception as e: + util.SMlog('linstor-manager:get_depth error: {}'.format(e)) + raise + + +def get_key_hash(session, args): + try: + device_path = args['devicePath'] + return vhdutil.getKeyHash(device_path) or '' + except Exception as e: + util.SMlog('linstor-manager:get_key_hash error: {}'.format(e)) + raise + + +def get_block_bitmap(session, args): + try: + device_path = args['devicePath'] + return base64.b64encode(vhdutil.getBlockBitmap(device_path)) or '' + except Exception as e: + util.SMlog('linstor-manager:get_block_bitmap error: {}'.format(e)) + raise + + +def set_parent(session, args): + try: + device_path = args['devicePath'] + parent_path = args['parentPath'] + vhdutil.setParent(device_path, parent_path, False) + return '' + except Exception as e: + util.SMlog('linstor-manager:set_parent error: {}'.format(e)) + raise + + +def coalesce(session, args): + try: + device_path = args['devicePath'] + vhdutil.coalesce(device_path) + return '' + except Exception as e: + util.SMlog('linstor-manager:coalesce error: {}'.format(e)) + raise + + +def repair(session, args): + try: + device_path = args['devicePath'] + vhdutil.repair(device_path) + return '' + except Exception as e: + util.SMlog('linstor-manager:repair error: {}'.format(e)) + raise + + +def lock_vdi(session, args): + lock = None + try: + sr_uuid = args['srUuid'] + vdi_uuid = args['vdiUuid'] + group_name = args['groupName'] + locked = distutils.util.strtobool(args['locked']) + + # We must lock to mark the VDI. 
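+ # The SR lock is only taken when marking (locked=True) and is released + # in the finally block once lock_volume has been called.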
+ lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) + if locked: + lock.acquire() + + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + linstor.lock_volume(vdi_uuid, locked) + + return str(True) + except Exception as e: + util.SMlog('linstor-manager:lock_vdi error: {}'.format(e)) + finally: + if locked and lock: + lock.release() + return str(False) + + +def has_controller_running(session, args): + (ret, stdout, stderr) = util.doexec([ + 'systemctl', 'is-active', '--quiet', 'linstor-controller' + ]) + return str(ret == 0) + + +def add_host(session, args): + group_name = args['groupName'] + + # 1. Find all LINSTOR SRs and PBDs. + srs = dict() + for sr_ref, sr in session.xenapi.SR.get_all_records().items(): + if sr.get('type') == 'linstor': + srs[sr_ref] = sr + + pbds = dict() + for pbd_ref, pbd in session.xenapi.PBD.get_all_records().items(): + device_config = pbd.get('device_config') + if ( + device_config and + device_config.get('group-name') == group_name + and pbd['SR'] in srs + ): + pbds[pbd_ref] = pbd + + # 2. Ensure there is at least one PBD and all PBDs are used in + # the same SR. + if not pbds: + raise Exception( + 'Failed to find PBDs of group `{}`'.format(group_name) + ) + + sr_ref = None + for pbd in pbds.values(): + if not sr_ref: + sr_ref = pbd['SR'] + elif pbd['SR'] != sr_ref: + raise Exception( + 'Group `{}` is used by many SRs!'.format(group_name) + ) + + # 3. Ensure node doesn't exist. + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + node_name = socket.gethostname() + has_node = linstor.has_node(node_name) + + new_pbd_ref = None + + try: + # 4. Enable services. + update_all_ports(open_ports=True) + update_drbd_reactor_service(start=True) + update_linstor_satellite_service(start=True) + + # 5. Try to create local node. + if not has_node: + linstor.create_node(node_name, util.get_this_host_address(session)) + + # 6. Try to create PBD. + this_host = util.get_this_host_ref(session) + create_new_pbd = True + + assert pbds + for pbd in pbds.values(): + if pbd['host'] == this_host: + create_new_pbd = False + break + + device_config = pbd['device_config'] + # Should be the same on all hosts. + provisioning = device_config['provisioning'] + + # 7. Create new PBD. + if create_new_pbd: + new_pbd_ref = session.xenapi.PBD.create({ + 'host': this_host, + 'SR': sr_ref, + 'device_config': { + 'group-name': group_name, + 'redundancy': linstor.redundancy, + 'provisioning': provisioning + } + }) + try: + session.xenapi.PBD.plug(new_pbd_ref) + except Exception as e: + util.SMlog('Failed to plug new PBD: {}'.format(e)) + + return str(True) + except Exception as e: + stop_services = not has_node + if stop_services: + try: + linstor.destroy_node(node_name) + except Exception: + pass + + if new_pbd_ref: + try: + session.xenapi.PBD.unplug(new_pbd_ref) + except Exception: + pass + + try: + session.xenapi.PBD.destroy(new_pbd_ref) + except Exception: + pass + + try: + # If we failed to remove the node, we don't stop services. + if stop_services and not linstor.has_node(node_name): + update_linstor_satellite_service(start=False) + update_drbd_reactor_service(start=False) + update_all_ports(open_ports=False) + except Exception: + pass + + raise e + + +def remove_host(session, args): + group_name = args['groupName'] + + # 1. Find all LINSTOR SRs and PBDs. 
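# addHost relies on socket.gethostname() to pick the LINSTOR node, so the
# plugin call must execute on the host being added; with XAPI this happens
# naturally because call_plugin runs on the target host. A sketch of driving
# it for every member of the pool; `session` is an assumed XenAPI session.
def add_all_hosts_to_group(session, group_name):
    for host_ref in session.xenapi.host.get_all():
        session.xenapi.host.call_plugin(
            host_ref, 'linstor-manager', 'addHost', {'groupName': group_name}
        )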
+ srs = dict() + for sr_ref, sr in session.xenapi.SR.get_all_records().items(): + if sr.get('type') == 'linstor': + srs[sr_ref] = sr + + pbds = dict() + for pbd_ref, pbd in session.xenapi.PBD.get_all_records().items(): + device_config = pbd.get('device_config') + if ( + device_config and + device_config.get('group-name') == group_name + and pbd['SR'] in srs + ): + pbds[pbd_ref] = pbd + + # 2. Remove node. + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + node_name = socket.gethostname() + if linstor.has_node(node_name): + linstor.destroy_node(node_name) + if linstor.has_node(node_name): + raise Exception('Failed to remove node! Unknown error.') + + this_host = util.get_this_host_ref(session) + + # 3. Remove PBD. + for pbd_ref, pbd in pbds.items(): + host = pbd['host'] + if host == this_host: + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + break + + # 3. Stop services. + try: + update_linstor_satellite_service(start=False) + update_drbd_reactor_service(start=False) + update_all_ports(open_ports=False) + except Exception as e: + util.SMlog('Error while stopping services: {}'.format(e)) + pass + + return str('True') + + +def create_sr(session, args): + try: + # Use a complex parsing contrary to the other functions because + # this helper is a public method and is not easy to use. + name = args.get('name') + if not name: + raise Exception('`name` is empty') + + description = args.get('description') or '' + + disks = args.get('disks') + if not disks: + raise Exception('`disks` is empty') + try: + disks = json.loads(disks) + except Exception as e: + raise Exception('failed to decode `disks`: {}'.format(e)) + if type(disks) is not dict: + raise Exception('`disks` must be a JSON object') + + volume_group = args.get('volume_group') + if not volume_group: + raise Exception('`volume_group` is empty') + + redundancy = args.get('redundancy') + if not redundancy: + raise Exception('`redundancy` is empty') + + try: + redundancy = int(redundancy) + except Exception: + raise Exception('`redundancy` is not a number') + + provisioning = args.get('provisioning') + if not provisioning: + provisioning = 'thin' + elif provisioning != 'thin' and provisioning != 'thick': + raise Exception('unsupported provisioning') + + force = distutils.util.strtobool(args.get('force') or '0') + + return exec_create_sr( + session, name, description, disks, volume_group, redundancy, provisioning, force + ) + except Exception as e: + util.SMlog('linstor-manager:create_sr error: {}'.format(e)) + raise + + +def demote_drbd_resource(session, args): + try: + resource_name = args['resource_name'] + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'secondary', resource_name]) + if ret: + raise Exception('Failed to demote resource: {}'.format(stderr)) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:demote_drbd_resource error: {}'.format(e)) + return str(False) + + +def list_drbd_volumes(session, args): + try: + volume_group = args.get('volume_group') + return json.dumps(get_drbd_volumes(volume_group)) + except Exception as e: + util.SMlog('linstor-manager:list_drbd_volumes error: {}'.format(e)) + raise + + +def destroy_drbd_volume(session, args): + try: + minor = args.get('minor') + if not minor: + raise Exception('Cannot destroy DRBD volume without minor.') + force_destroy_drbd_volume(minor) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:destroy_drbd_volume error: 
{}'.format(e)) + return str(False) + + +def destroy_drbd_volumes(session, args): + try: + volume_group = args.get('volume_group') + if not volume_group: + raise Exception('Cannot destroy DRBD volumes without volume group.') + for minor in get_drbd_volumes(volume_group).get(volume_group, []): + force_destroy_drbd_volume(str(minor)) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:destroy_drbd_volumes error: {}'.format(e)) + return str(False) + + +def get_drbd_openers(session, args): + try: + resource_name = args.get('resourceName') + volume = args.get('volume') + return get_local_volume_openers(resource_name, volume) + except Exception as e: + util.SMlog('linstor-manager:get_drbd_openers error: {}'.format(e)) + raise + + +def health_check(session, args): + group_name = args['groupName'] + + result = { + 'controller-uri': '', + 'nodes': {}, + 'storage-pools': {}, + 'warnings': [], + 'errors': [] + } + + def format_result(): + return json.dumps(result) + + # 1. Get controller. + try: + controller_uri = get_controller_uri() + + result['controller-uri'] = controller_uri + try: + if controller_uri == 'linstor://localhost': + # Replace `localhost` with IP to give a better info for users. + result['controller-uri'] = 'linstor://' + util.get_this_host_address(session) + except Exception: + # Ignore error: can be a XAPI restart or something else. + pass + + linstor = LinstorVolumeManager( + controller_uri, + group_name, + logger=util.SMlog + ) + except Exception as e: + # Probably a network issue, or offline controller. + result['errors'].append('Cannot join SR: `{}`.'.format(e)) + return format_result() + + try: + # 2. Check node statuses. + nodes = linstor.get_nodes_info() + result['nodes'] = nodes + for node_name, status in nodes.items(): + if status != 'ONLINE': + result['warnings'].append('Node `{}` is {}.'.format(node_name, status)) + + # 3. Check storage pool statuses. + storage_pools_per_node = linstor.get_storage_pools_info() + result['storage-pools'] = storage_pools_per_node + for node_name, storage_pools in storage_pools_per_node.items(): + for storage_pool in storage_pools: + free_size = storage_pool['free-size'] + capacity = storage_pool['capacity'] + if free_size < 0 or capacity <= 0: + result['errors'].append( + 'Cannot get free size and/or capacity of storage pool `{}`.' + .format(storage_pool['uuid']) + ) + elif free_size > capacity: + result['errors'].append( + 'Free size of storage pool `{}` is greater than capacity.' + .format(storage_pool['uuid']) + ) + else: + remaining_percent = free_size / float(capacity) * 100.0 + threshold = 10.0 + if remaining_percent < threshold: + result['warnings'].append( + 'Remaining size of storage pool `{}` is below {}% of its capacity.' + .format(storage_pool['uuid'], threshold) + ) + + # 4. Check resource statuses. 
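# healthCheck returns its report as a JSON string whose 'warnings' and
# 'errors' keys are lists of human-readable messages. A small sketch of
# consuming it; `session` and `host_ref` are assumed to exist and are not
# defined in this patch.
import json

def is_sr_healthy(session, host_ref, group_name):
    raw = session.xenapi.host.call_plugin(
        host_ref, 'linstor-manager', 'healthCheck', {'groupName': group_name}
    )
    report = json.loads(raw)
    for message in report['warnings']:
        print('WARNING: {}'.format(message))
    for message in report['errors']:
        print('ERROR: {}'.format(message))
    return not report['errors']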
+ all_resources = linstor.get_resources_info() + result['resources'] = all_resources + + for resource_name, resource_by_node in all_resources.items(): + for node_name, resource in resource_by_node.items(): + for volume_index, volume in enumerate(resource['volumes']): + disk_state = volume['disk-state'] + if disk_state in ['UpToDate', 'Created', 'Attached']: + continue + if disk_state == 'DUnknown': + result['warnings'].append( + 'Unknown state for volume `{}` at index {} for resource `{}` on node `{}`' + .format(volume['device-path'], volume_index, resource_name, node_name) + ) + continue + if disk_state in ['Inconsistent', 'Failed', 'To: Creating', 'To: Attachable', 'To: Attaching']: + result['errors'].append( + 'Invalid state `{}` for volume `{}` at index {} for resource `{}` on node `{}`' + .format(disk_state, volume['device-path'], volume_index, resource_name, node_name) + ) + continue + if disk_state == 'Diskless': + if resource['diskful']: + result['errors'].append( + 'Unintentional diskless state detected for volume `{}` at index {} for resource `{}` on node `{}`' + .format(volume['device-path'], volume_index, resource_name, node_name) + ) + elif resource['tie-breaker']: + volume['disk-state'] = 'TieBreaker' + continue + result['warnings'].append( + 'Unhandled state `{}` for volume `{}` at index {} for resource `{}` on node `{}`' + .format(disk_state, volume['device-path'], volume_index, resource_name, node_name) + ) + + except Exception as e: + result['errors'].append('Unexpected error: `{}`'.format(e)) + + return format_result() + + +if __name__ == '__main__': + XenAPIPlugin.dispatch({ + 'prepareSr': prepare_sr, + 'releaseSr': release_sr, + 'updateDrbdReactor': update_drbd_reactor, + 'attach': attach, + 'detach': detach, + 'destroy': destroy, + + # vhdutil wrappers called by linstorvhdutil. + # Note: When a VHD is open in RO mode (so for all vhdutil getters), + # the LVM layer is used directly to bypass DRBD verifications. + # In this case there can't be EROFS errors. + # Note 2: We assume linstorvhdutil executes remote calls on diskful + # DRBDs, otherwise we still have EROFS errors... + 'check': check, + 'getVHDInfo': get_vhd_info, + 'hasParent': has_parent, + 'getParent': get_parent, + 'getSizeVirt': get_size_virt, + 'getSizePhys': get_size_phys, + 'getDepth': get_depth, + 'getKeyHash': get_key_hash, + 'getBlockBitmap': get_block_bitmap, + + # Called by cleanup.py to coalesce when a primary + # is opened on a non-local host. + 'setParent': set_parent, + 'coalesce': coalesce, + 'repair': repair, + + 'lockVdi': lock_vdi, + 'hasControllerRunning': has_controller_running, + 'addHost': add_host, + 'removeHost': remove_host, + 'createSr': create_sr, + 'listDrbdVolumes': list_drbd_volumes, + 'demoteDrbdResource': demote_drbd_resource, + 'destroyDrbdVolume': destroy_drbd_volume, + 'destroyDrbdVolumes': destroy_drbd_volumes, + 'getDrbdOpeners': get_drbd_openers, + 'healthCheck': health_check + }) diff --git a/drivers/linstorjournaler.py b/drivers/linstorjournaler.py new file mode 100755 index 00000000..1e85ec96 --- /dev/null +++ b/drivers/linstorjournaler.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +from linstorvolumemanager import \ + get_controller_uri, LinstorVolumeManager, LinstorVolumeManagerError +import linstor +import re +import util + + +class LinstorJournalerError(Exception): + pass + +# ============================================================================== + + +class LinstorJournaler: + """ + Simple journaler that uses LINSTOR properties for persistent "storage". + A journal is a id-value pair, and there can be only one journal for a + given id. An identifier is juste a transaction name. + """ + + REG_TYPE = re.compile('^([^/]+)$') + REG_TRANSACTION = re.compile('^[^/]+/([^/]+)$') + + """ + Types of transaction in the journal. + """ + CLONE = 'clone' + INFLATE = 'inflate' + + @staticmethod + def default_logger(*args): + print(args) + + def __init__(self, uri, group_name, logger=default_logger.__func__): + self._namespace = '{}journal/'.format( + LinstorVolumeManager._build_sr_namespace() + ) + self._logger = logger + self._journal = self._create_journal_instance( + uri, group_name, self._namespace + ) + + def create(self, type, identifier, value): + # TODO: Maybe rename to 'add' in the future (in Citrix code too). + + key = self._get_key(type, identifier) + + # 1. Ensure transaction doesn't exist. + current_value = self.get(type, identifier) + if current_value is not None: + raise LinstorJournalerError( + 'Journal transaction already exists for \'{}:{}\': {}' + .format(type, identifier, current_value) + ) + + # 2. Write! + try: + self._reset_namespace() + self._logger( + 'Create journal transaction \'{}:{}\''.format(type, identifier) + ) + self._journal[key] = str(value) + except Exception as e: + try: + self._journal.pop(key, 'empty') + except Exception as e2: + self._logger( + 'Failed to clean up failed journal write: {} (Ignored)' + .format(e2) + ) + + raise LinstorJournalerError( + 'Failed to write to journal: {}'.format(e) + ) + + def remove(self, type, identifier): + key = self._get_key(type, identifier) + try: + self._reset_namespace() + self._logger( + 'Destroy journal transaction \'{}:{}\'' + .format(type, identifier) + ) + self._journal.pop(key) + except Exception as e: + raise LinstorJournalerError( + 'Failed to remove transaction \'{}:{}\': {}' + .format(type, identifier, e) + ) + + def get(self, type, identifier): + self._reset_namespace() + return self._journal.get(self._get_key(type, identifier)) + + def get_all(self, type): + entries = {} + + self._journal.namespace = self._namespace + '{}/'.format(type) + for (key, value) in self._journal.items(): + res = self.REG_TYPE.match(key) + if res: + identifier = res.groups()[0] + entries[identifier] = value + return entries + + # Added to compatibility with Citrix API. + def getAll(self, type): + return self.get_all(type) + + def has_entries(self, identifier): + self._reset_namespace() + for (key, value) in self._journal.items(): + res = self.REG_TRANSACTION.match(key) + if res: + current_identifier = res.groups()[0] + if current_identifier == identifier: + return True + return False + + # Added to compatibility with Citrix API. 
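# A short sketch of the intended usage pattern of the journaler: record a
# transaction before a multi-step operation and remove it only once the
# operation has finished, so that a crash in between leaves an entry behind
# for later repair. The URI, group name, identifier and value below are
# placeholders.
from linstorjournaler import LinstorJournaler

journaler = LinstorJournaler('linstor://localhost', 'my-group')
journaler.create(LinstorJournaler.INFLATE, 'my-vdi-uuid', 1073741824)
# ... perform the interruptible work here ...
journaler.remove(LinstorJournaler.INFLATE, 'my-vdi-uuid')

# On recovery, the pending transactions of a given type can be listed:
pending = journaler.get_all(LinstorJournaler.INFLATE)  # {identifier: value}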
+ def hasJournals(self, identifier): + return self.has_entries(identifier) + + def _reset_namespace(self): + self._journal.namespace = self._namespace + + @classmethod + def _create_journal_instance(cls, uri, group_name, namespace): + def connect(uri): + if not uri: + uri = get_controller_uri() + if not uri: + raise LinstorVolumeManagerError( + 'Unable to find controller uri...' + ) + return linstor.KV( + LinstorVolumeManager._build_group_name(group_name), + uri=uri, + namespace=namespace + ) + + try: + return connect(uri) + except (linstor.errors.LinstorNetworkError, LinstorVolumeManagerError): + pass + + return util.retry( + lambda: connect(None), + maxretry=10, + exceptions=[ + linstor.errors.LinstorNetworkError, LinstorVolumeManagerError + ] + ) + + @staticmethod + def _get_key(type, identifier): + return '{}/{}'.format(type, identifier) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py new file mode 100644 index 00000000..8b6985d9 --- /dev/null +++ b/drivers/linstorvhdutil.py @@ -0,0 +1,432 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import base64 +import distutils.util +import errno +import json +import socket +import util +import vhdutil +import xs_errors + +MANAGER_PLUGIN = 'linstor-manager' + +# EMEDIUMTYPE constant (124) is not available in python2. +EMEDIUMTYPE = 124 + + +def call_vhd_util_on_host(session, host_ref, method, device_path, args): + try: + response = session.xenapi.host.call_plugin( + host_ref, MANAGER_PLUGIN, method, args + ) + except Exception as e: + util.SMlog('call-plugin ({} with {}) exception: {}'.format( + method, args, e + )) + raise + + util.SMlog('call-plugin ({} with {}) returned: {}'.format( + method, args, response + )) + + return response + + +class LinstorCallException(Exception): + def __init__(self, cmd_err): + self.cmd_err = cmd_err + + def __str__(self): + return str(self.cmd_err) + + +class ErofsLinstorCallException(LinstorCallException): + pass + + +class NoPathLinstorCallException(LinstorCallException): + pass + + +def linstorhostcall(local_method, remote_method): + def decorated(response_parser): + def wrapper(*args, **kwargs): + self = args[0] + vdi_uuid = args[1] + + device_path = self._linstor.build_device_path( + self._linstor.get_volume_name(vdi_uuid) + ) + + # A. Try a call using directly the DRBD device to avoid + # remote request. + + # Try to read locally if the device is not in use or if the device + # is up to date and not diskless. + (node_names, in_use_by) = \ + self._linstor.find_up_to_date_diskful_nodes(vdi_uuid) + + local_e = None + try: + if not in_use_by or socket.gethostname() in node_names: + return self._call_local_vhd_util(local_method, device_path, *args[2:], **kwargs) + except ErofsLinstorCallException as e: + local_e = e.cmd_err + except Exception as e: + local_e = e + + util.SMlog( + 'unable to execute `{}` locally, retry using a readable host... 
(cause: {})'.format( + remote_method, local_e if local_e else 'local diskless + in use or not up to date' + ) + ) + + if in_use_by: + node_names = {in_use_by} + + # B. Execute the plugin on master or slave. + remote_args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + remote_args.update(**kwargs) + remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} + + try: + def remote_call(): + host_ref = self._get_readonly_host(vdi_uuid, device_path, node_names) + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + response = util.retry(remote_call, 5, 2) + except Exception as remote_e: + self._raise_openers_exception(device_path, local_e or remote_e) + + return response_parser(self, vdi_uuid, response) + return wrapper + return decorated + + +def linstormodifier(): + def decorated(func): + def wrapper(*args, **kwargs): + self = args[0] + + ret = func(*args, **kwargs) + self._linstor.invalidate_resource_cache() + return ret + return wrapper + return decorated + + +class LinstorVhdUtil: + def __init__(self, session, linstor): + self._session = session + self._linstor = linstor + + # -------------------------------------------------------------------------- + # Getters: read locally and try on another host in case of failure. + # -------------------------------------------------------------------------- + + def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): + kwargs = { + 'ignoreMissingFooter': ignore_missing_footer, + 'fast': fast + } + return self._check(vdi_uuid, **kwargs) # pylint: disable = E1123 + + @linstorhostcall(vhdutil.check, 'check') + def _check(self, vdi_uuid, response): + return distutils.util.strtobool(response) + + def get_vhd_info(self, vdi_uuid, include_parent=True): + kwargs = { + 'includeParent': include_parent, + 'resolveParent': False + } + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) # pylint: disable = E1123 + + @linstorhostcall(vhdutil.getVHDInfo, 'getVHDInfo') + def _get_vhd_info(self, vdi_uuid, response): + obj = json.loads(response) + + vhd_info = vhdutil.VHDInfo(vdi_uuid) + vhd_info.sizeVirt = obj['sizeVirt'] + vhd_info.sizePhys = obj['sizePhys'] + if 'parentPath' in obj: + vhd_info.parentPath = obj['parentPath'] + vhd_info.parentUuid = obj['parentUuid'] + vhd_info.hidden = obj['hidden'] + vhd_info.path = obj['path'] + + return vhd_info + + @linstorhostcall(vhdutil.hasParent, 'hasParent') + def has_parent(self, vdi_uuid, response): + return distutils.util.strtobool(response) + + def get_parent(self, vdi_uuid): + return self._get_parent(vdi_uuid, self._extract_uuid) + + @linstorhostcall(vhdutil.getParent, 'getParent') + def _get_parent(self, vdi_uuid, response): + return response + + @linstorhostcall(vhdutil.getSizeVirt, 'getSizeVirt') + def get_size_virt(self, vdi_uuid, response): + return int(response) + + @linstorhostcall(vhdutil.getSizePhys, 'getSizePhys') + def get_size_phys(self, vdi_uuid, response): + return int(response) + + @linstorhostcall(vhdutil.getDepth, 'getDepth') + def get_depth(self, vdi_uuid, response): + return int(response) + + @linstorhostcall(vhdutil.getKeyHash, 'getKeyHash') + def get_key_hash(self, vdi_uuid, response): + return response or None + + @linstorhostcall(vhdutil.getBlockBitmap, 'getBlockBitmap') + def get_block_bitmap(self, vdi_uuid, response): + return base64.b64decode(response) + + # 
-------------------------------------------------------------------------- + # Setters: only used locally. + # -------------------------------------------------------------------------- + + @linstormodifier() + def create(self, path, size, static, msize=0): + return self._call_local_vhd_util_or_fail(vhdutil.create, path, size, static, msize) + + @linstormodifier() + def set_size_virt_fast(self, path, size): + return self._call_local_vhd_util_or_fail(vhdutil.setSizeVirtFast, path, size) + + @linstormodifier() + def set_size_phys(self, path, size, debug=True): + return self._call_local_vhd_util_or_fail(vhdutil.setSizePhys, path, size, debug) + + @linstormodifier() + def set_parent(self, path, parentPath, parentRaw=False): + return self._call_local_vhd_util_or_fail(vhdutil.setParent, path, parentPath, parentRaw) + + @linstormodifier() + def set_hidden(self, path, hidden=True): + return self._call_local_vhd_util_or_fail(vhdutil.setHidden, path, hidden) + + @linstormodifier() + def set_key(self, path, key_hash): + return self._call_local_vhd_util_or_fail(vhdutil.setKey, path, key_hash) + + @linstormodifier() + def kill_data(self, path): + return self._call_local_vhd_util_or_fail(vhdutil.killData, path) + + @linstormodifier() + def snapshot(self, path, parent, parentRaw, msize=0, checkEmpty=True): + return self._call_local_vhd_util_or_fail(vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) + + # -------------------------------------------------------------------------- + # Remote setters: write locally and try on another host in case of failure. + # -------------------------------------------------------------------------- + + @linstormodifier() + def force_parent(self, path, parentPath, parentRaw=False): + kwargs = { + 'parentPath': str(parentPath), + 'parentRaw': parentRaw + } + return self._call_vhd_util(vhdutil.setParent, 'setParent', path, use_parent=False, **kwargs) + + @linstormodifier() + def force_coalesce(self, path): + return self._call_vhd_util(vhdutil.coalesce, 'coalesce', path, use_parent=True) + + @linstormodifier() + def force_repair(self, path): + return self._call_vhd_util(vhdutil.repair, 'repair', path, use_parent=False) + + # -------------------------------------------------------------------------- + # Helpers. + # -------------------------------------------------------------------------- + + def _extract_uuid(self, device_path): + # TODO: Remove new line in the vhdutil module. Not here. + return self._linstor.get_volume_uuid_from_device_path( + device_path.rstrip('\n') + ) + + def _get_readonly_host(self, vdi_uuid, device_path, node_names): + """ + When vhd-util is called to fetch VDI info we must find a + diskful DRBD disk to read the data. It's the goal of this function. + Why? Because when a VHD is open in RO mode, the LVM layer is used + directly to bypass DRBD verifications (we can have only one process + that reads/writes to disk with DRBD devices). 
+ """ + + if not node_names: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to find diskful node: {} (path={})' + .format(vdi_uuid, device_path) + ) + + hosts = self._session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + if host_record['hostname'] in node_names: + return host_ref + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to find a valid host from VDI: {} (path={})' + .format(vdi_uuid, device_path) + ) + + # -------------------------------------------------------------------------- + + def _raise_openers_exception(self, device_path, e): + if isinstance(e, util.CommandException): + e_str = 'cmd: `{}`, code: `{}`, reason: `{}`'.format(e.cmd, e.code, e.reason) + else: + e_str = str(e) + + e_with_openers = None + try: + volume_uuid = self._linstor.get_volume_uuid_from_device_path( + device_path + ) + e_wrapper = Exception( + e_str + ' (openers: {})'.format( + self._linstor.get_volume_openers(volume_uuid) + ) + ) + except Exception as illformed_e: + e_wrapper = Exception( + e_str + ' (unable to get openers: {})'.format(illformed_e) + ) + util.SMlog('raise opener exception: {}'.format(e_wrapper)) + raise e_wrapper # pylint: disable = E0702 + + def _call_local_vhd_util(self, local_method, device_path, *args, **kwargs): + try: + def local_call(): + try: + return local_method(device_path, *args, **kwargs) + except util.CommandException as e: + if e.code == errno.EROFS or e.code == EMEDIUMTYPE: + raise ErofsLinstorCallException(e) # Break retry calls. + if e.code == errno.ENOENT: + raise NoPathLinstorCallException(e) + raise e + # Retry only locally if it's not an EROFS exception. + return util.retry(local_call, 5, 2, exceptions=[util.CommandException]) + except util.CommandException as e: + util.SMlog('failed to execute locally vhd-util (sys {})'.format(e.code)) + raise e + + def _call_local_vhd_util_or_fail(self, local_method, device_path, *args, **kwargs): + try: + return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) + except ErofsLinstorCallException as e: + # Volume is locked on a host, find openers. + self._raise_openers_exception(device_path, e.cmd_err) + + def _call_vhd_util(self, local_method, remote_method, device_path, use_parent, *args, **kwargs): + # Note: `use_parent` exists to know if the VHD parent is used by the local/remote method. + # Normally in case of failure, if the parent is unused we try to execute the method on + # another host using the DRBD opener list. In the other case, if the parent is required, + # we must check where this last one is open instead of the child. + + # A. Try to write locally... + try: + return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) + except Exception: + pass + + util.SMlog('unable to execute `{}` locally, retry using a writable host...'.format(remote_method)) + + # B. Execute the command on another host. + # B.1. Get host list. + try: + hosts = self._session.xenapi.host.get_all_records() + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to get host list to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + + # B.2. Prepare remote args. 
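# The helpers above give LinstorVhdUtil its fallback behaviour: getters retry
# on a host holding an up-to-date diskful copy, force_* setters retry on a
# host that currently opens the DRBD device. A usage sketch; `session`,
# `linstor`, the UUID and the device path are placeholders.
from linstorvhdutil import LinstorVhdUtil

def coalesce_if_valid(session, linstor, vdi_uuid, device_path):
    vhd = LinstorVhdUtil(session, linstor)
    if vhd.check(vdi_uuid):
        vhd.force_coalesce(device_path)
        return True
    return False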
+ remote_args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + remote_args.update(**kwargs) + remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} + + volume_uuid = self._linstor.get_volume_uuid_from_device_path( + device_path + ) + parent_volume_uuid = None + if use_parent: + parent_volume_uuid = self.get_parent(volume_uuid) + + openers_uuid = parent_volume_uuid if use_parent else volume_uuid + + # B.3. Call! + def remote_call(): + try: + all_openers = self._linstor.get_volume_openers(openers_uuid) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to get DRBD openers to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + + no_host_found = True + for hostname, openers in all_openers.iteritems(): + if not openers: + continue + + try: + host_ref = next(ref for ref, rec in hosts.iteritems() if rec['hostname'] == hostname) + except StopIteration: + continue + + no_host_found = False + try: + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + except Exception: + pass + + if no_host_found: + try: + return local_method(device_path, *args, **kwargs) + except Exception as e: + self._raise_openers_exception(device_path, e) + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='No valid host found to run vhd-util command `{}` (path=`{}`, openers=`{}`): {}' + .format(remote_method, device_path, openers, e) + ) + return util.retry(remote_call, 5, 2) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py new file mode 100755 index 00000000..97815734 --- /dev/null +++ b/drivers/linstorvolumemanager.py @@ -0,0 +1,3003 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +import distutils.util +import errno +import glob +import json +import linstor +import os.path +import re +import shutil +import socket +import stat +import time +import util +import uuid + + +# Contains the data of the "/var/lib/linstor" directory. +DATABASE_VOLUME_NAME = 'xcp-persistent-database' +DATABASE_SIZE = 1 << 30 # 1GB. +DATABASE_PATH = '/var/lib/linstor' +DATABASE_MKFS = 'mkfs.ext4' + +REG_DRBDADM_PRIMARY = re.compile("([^\\s]+)\\s+role:Primary") +REG_DRBDSETUP_IP = re.compile('[^\\s]+\\s+(.*):.*$') + +DRBD_BY_RES_PATH = '/dev/drbd/by-res/' + +PLUGIN = 'linstor-manager' + + +# ============================================================================== + +def get_local_volume_openers(resource_name, volume): + if not resource_name or volume is None: + raise Exception('Cannot get DRBD openers without resource name and/or volume.') + + path = '/sys/kernel/debug/drbd/resources/{}/volumes/{}/openers'.format( + resource_name, volume + ) + + with open(path, 'r') as openers: + # Not a big cost, so read all lines directly. 
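# Each line of the debugfs openers file is assumed by the parsing just below
# to look like "<process name> <pid> <open duration in ms>". A tiny
# illustration against a made-up sample line:
import re

sample = 'tapdisk 12345 6789'
opener_re = re.compile('(.*)\\s+([0-9]+)\\s+([0-9]+)')
process_name, pid, open_duration_ms = opener_re.match(sample).groups()
assert (process_name, pid, open_duration_ms) == ('tapdisk', '12345', '6789')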
+ lines = openers.readlines() + + result = {} + + opener_re = re.compile('(.*)\\s+([0-9]+)\\s+([0-9]+)') + for line in lines: + match = opener_re.match(line) + assert match + + groups = match.groups() + process_name = groups[0] + pid = groups[1] + open_duration_ms = groups[2] + result[pid] = { + 'process-name': process_name, + 'open-duration': open_duration_ms + } + + return json.dumps(result) + +def get_all_volume_openers(resource_name, volume): + PLUGIN_CMD = 'getDrbdOpeners' + + volume = str(volume) + openers = {} + + # Make sure this call never stucks because this function can be called + # during HA init and in this case we can wait forever. + session = util.timeout_call(10, util.get_localAPI_session) + + hosts = session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + node_name = host_record['hostname'] + try: + if not session.xenapi.host_metrics.get_record( + host_record['metrics'] + )['live']: + # Ensure we call plugin on online hosts only. + continue + + openers[node_name] = json.loads( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { + 'resourceName': resource_name, + 'volume': volume + }) + ) + except Exception as e: + util.SMlog('Failed to get openers of `{}` on `{}`: {}'.format( + resource_name, node_name, e + )) + + return openers + + +# ============================================================================== + +def round_up(value, divisor): + assert divisor + divisor = int(divisor) + return int((int(value) + divisor - 1) / divisor) * divisor + + +def round_down(value, divisor): + assert divisor + value = int(value) + return value - (value % int(divisor)) + + +# ============================================================================== + +def get_remote_host_ip(node_name): + (ret, stdout, stderr) = util.doexec([ + 'drbdsetup', 'show', DATABASE_VOLUME_NAME, '--json' + ]) + if ret != 0: + return + + try: + conf = json.loads(stdout) + if not conf: + return + + for connection in conf[0]['connections']: + if connection['net']['_name'] == node_name: + value = connection['path']['_remote_host'] + res = REG_DRBDSETUP_IP.match(value) + if res: + return res.groups()[0] + break + except Exception: + pass + + +def _get_controller_uri(): + PLUGIN_CMD = 'hasControllerRunning' + + # Try to find controller using drbdadm. + (ret, stdout, stderr) = util.doexec([ + 'drbdadm', 'status', DATABASE_VOLUME_NAME + ]) + if ret == 0: + # If we are here, the database device exists locally. + + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + # Nice case, we have the controller running on this local host. + return 'linstor://localhost' + + # Try to find the host using DRBD connections. + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + node_name = res.groups()[0] + ip = get_remote_host_ip(node_name) + if ip: + return 'linstor://' + ip + + # Worst case: we use many hosts in the pool (>= 4), so we can't find the + # primary using drbdadm because we don't have all connections to the + # replicated volume. `drbdadm status xcp-persistent-database` returns + # 3 connections by default. + try: + session = util.timeout_call(10, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + node_name = host_record['hostname'] + try: + if distutils.util.strtobool( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) + ): + return 'linstor://' + host_record['address'] + except Exception as e: + # Can throw and exception if a host is offline. So catch it. 
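# round_up and round_down defined above align byte sizes on an arbitrary
# divisor, e.g. the 4 MiB default LVM extent size used further down in this
# file (BLOCK_SIZE). Tiny worked examples:
assert round_up(1, 4 * 1024 * 1024) == 4 * 1024 * 1024
assert round_up(8 * 1024 * 1024, 4 * 1024 * 1024) == 8 * 1024 * 1024
assert round_down(5 * 1024 * 1024, 4 * 1024 * 1024) == 4 * 1024 * 1024
assert round_down(4 * 1024 * 1024, 4 * 1024 * 1024) == 4 * 1024 * 1024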
+ util.SMlog('Unable to search controller on `{}`: {}'.format( + node_name, e + )) + except: + # Not found, maybe we are trying to create the SR... + pass + +def get_controller_uri(): + retries = 0 + while True: + uri = _get_controller_uri() + if uri: + return uri + + retries += 1 + if retries >= 10: + break + time.sleep(1) + + +def get_controller_node_name(): + PLUGIN_CMD = 'hasControllerRunning' + + (ret, stdout, stderr) = util.doexec([ + 'drbdadm', 'status', DATABASE_VOLUME_NAME + ]) + + if ret == 0: + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + return 'localhost' + + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + return res.groups()[0] + + session = util.timeout_call(5, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + node_name = host_record['hostname'] + try: + if not session.xenapi.host_metrics.get_record( + host_record['metrics'] + )['live']: + continue + + if distutils.util.strtobool(session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {} + )): + return node_name + except Exception as e: + util.SMlog('Failed to call plugin to get controller on `{}`: {}'.format( + node_name, e + )) + + +def demote_drbd_resource(node_name, resource_name): + PLUGIN_CMD = 'demoteDrbdResource' + + session = util.timeout_call(5, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + if host_record['hostname'] != node_name: + continue + + try: + session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {'resource_name': resource_name} + ) + except Exception as e: + util.SMlog('Failed to demote resource `{}` on `{}`: {}'.format( + resource_name, node_name, e + )) + raise Exception( + 'Can\'t demote resource `{}`, unable to find node `{}`' + .format(resource_name, node_name) + ) + +# ============================================================================== + +class LinstorVolumeManagerError(Exception): + ERR_GENERIC = 0, + ERR_VOLUME_EXISTS = 1, + ERR_VOLUME_NOT_EXISTS = 2 + + def __init__(self, message, code=ERR_GENERIC): + super(LinstorVolumeManagerError, self).__init__(message) + self._code = code + + @property + def code(self): + return self._code + + +# ============================================================================== + +# Note: +# If a storage pool is not accessible after a network change: +# linstor node interface modify default --ip + + +class LinstorVolumeManager(object): + """ + API to manager LINSTOR volumes in XCP-ng. + A volume in this context is a physical part of the storage layer. + """ + + __slots__ = ( + '_linstor', '_logger', + '_uri', '_base_group_name', + '_redundancy', '_group_name', + '_volumes', '_storage_pools', + '_storage_pools_time', + '_kv_cache', '_resource_cache', '_volume_info_cache', + '_kv_cache_dirty', '_resource_cache_dirty', '_volume_info_cache_dirty' + ) + + DEV_ROOT_PATH = DRBD_BY_RES_PATH + + # Default LVM extent size. + BLOCK_SIZE = 4 * 1024 * 1024 + + # List of volume properties. + PROP_METADATA = 'metadata' + PROP_NOT_EXISTS = 'not-exists' + PROP_VOLUME_NAME = 'volume-name' + PROP_IS_READONLY_TIMESTAMP = 'readonly-timestamp' + + # A volume can only be locked for a limited duration. + # The goal is to give enough time to slaves to execute some actions on + # a device before an UUID update or a coalesce for example. + # Expiration is expressed in seconds. + LOCKED_EXPIRATION_DELAY = 1 * 60 + + # Used when volume uuid is being updated. 
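# A construction sketch for the volume manager. Failures are reported as
# LinstorVolumeManagerError, whose `code` property can be compared against
# the ERR_* constants above; the group name is a placeholder.
from linstorvolumemanager import (
    LinstorVolumeManager, LinstorVolumeManagerError, get_controller_uri
)

try:
    linstor = LinstorVolumeManager(get_controller_uri(), 'my-group')
except LinstorVolumeManagerError as e:
    # code is ERR_GENERIC here, e.g. when the resource group does not exist.
    print('cannot reach SR: {} (code={})'.format(e, e.code))
    raise
print(linstor.volumes)  # set of volume UUIDs known to this SR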
+ PROP_UPDATING_UUID_SRC = 'updating-uuid-src' + + # States of property PROP_NOT_EXISTS. + STATE_EXISTS = '0' + STATE_NOT_EXISTS = '1' + STATE_CREATING = '2' + + # Property namespaces. + NAMESPACE_SR = 'xcp/sr' + NAMESPACE_VOLUME = 'xcp/volume' + + # Regex to match properties. + REG_PROP = '^([^/]+)/{}$' + + REG_METADATA = re.compile(REG_PROP.format(PROP_METADATA)) + REG_NOT_EXISTS = re.compile(REG_PROP.format(PROP_NOT_EXISTS)) + REG_VOLUME_NAME = re.compile(REG_PROP.format(PROP_VOLUME_NAME)) + REG_UPDATING_UUID_SRC = re.compile(REG_PROP.format(PROP_UPDATING_UUID_SRC)) + + # Prefixes of SR/VOLUME in the LINSTOR DB. + # A LINSTOR (resource, group, ...) name cannot start with a number. + # So we add a prefix behind our SR/VOLUME uuids. + PREFIX_SR = 'xcp-sr-' + PREFIX_VOLUME = 'xcp-volume-' + + # Limit request number when storage pool info is asked, we fetch + # the current pool status after N elapsed seconds. + STORAGE_POOLS_FETCH_INTERVAL = 15 + + @staticmethod + def default_logger(*args): + print(args) + + # -------------------------------------------------------------------------- + # API. + # -------------------------------------------------------------------------- + + class VolumeInfo(object): + __slots__ = ( + 'name', + 'allocated_size', # Allocated size, place count is not used. + 'virtual_size', # Total virtual available size of this volume + # (i.e. the user size at creation). + 'diskful' # Array of nodes that have a diskful volume. + ) + + def __init__(self, name): + self.name = name + self.allocated_size = 0 + self.virtual_size = 0 + self.diskful = [] + + def __repr__(self): + return 'VolumeInfo("{}", {}, {}, {})'.format( + self.name, self.allocated_size, self.virtual_size, + self.diskful + ) + + # -------------------------------------------------------------------------- + + def __init__( + self, uri, group_name, repair=False, logger=default_logger.__func__, + attempt_count=30 + ): + """ + Create a new LinstorVolumeManager object. + :param str uri: URI to communicate with the LINSTOR controller. + :param str group_name: The SR goup name to use. + :param bool repair: If true we try to remove bad volumes due to a crash + or unexpected behavior. + :param function logger: Function to log messages. + :param int attempt_count: Number of attempts to join the controller. + """ + + self._linstor = self._create_linstor_instance( + uri, attempt_count=attempt_count + ) + self._base_group_name = group_name + + # Ensure group exists. + group_name = self._build_group_name(group_name) + groups = self._linstor.resource_group_list_raise([group_name]) + groups = groups.resource_groups + if not groups: + raise LinstorVolumeManagerError( + 'Unable to find `{}` Linstor SR'.format(group_name) + ) + + # Ok. ;) + self._logger = logger + self._redundancy = groups[0].select_filter.place_count + self._group_name = group_name + self._volumes = set() + self._storage_pools_time = 0 + + # To increate performance and limit request count to LINSTOR services, + # we use caches. + self._kv_cache = self._create_kv_cache() + self._resource_cache = None + self._resource_cache_dirty = True + self._volume_info_cache = None + self._volume_info_cache_dirty = True + self._build_volumes(repair=repair) + + @property + def group_name(self): + """ + Give the used group name. + :return: The group name. + :rtype: str + """ + return self._base_group_name + + @property + def redundancy(self): + """ + Give the used redundancy. + :return: The redundancy. 
+ :rtype: int + """ + return self._redundancy + + @property + def volumes(self): + """ + Give the volumes uuid set. + :return: The volumes uuid set. + :rtype: set(str) + """ + return self._volumes + + @property + def max_volume_size_allowed(self): + """ + Give the max volume size currently available in B. + :return: The current size. + :rtype: int + """ + + candidates = self._find_best_size_candidates() + if not candidates: + raise LinstorVolumeManagerError( + 'Failed to get max volume size allowed' + ) + + size = candidates[0].max_volume_size + if size < 0: + raise LinstorVolumeManagerError( + 'Invalid max volume size allowed given: {}'.format(size) + ) + return self.round_down_volume_size(size * 1024) + + @property + def physical_size(self): + """ + Give the total physical size of the SR. + :return: The physical size. + :rtype: int + """ + return self._compute_size('total_capacity') + + @property + def physical_free_size(self): + """ + Give the total free physical size of the SR. + :return: The physical free size. + :rtype: int + """ + return self._compute_size('free_capacity') + + @property + def allocated_volume_size(self): + """ + Give the allocated size for all volumes. The place count is not + used here. When thick lvm is used, the size for one volume should + be equal to the virtual volume size. With thin lvm, the size is equal + or lower to the volume size. + :return: The allocated size of all volumes. + :rtype: int + """ + + # Paths: /res_name/vol_number/size + sizes = {} + + for resource in self._get_resource_cache().resources: + if resource.name not in sizes: + current = sizes[resource.name] = {} + else: + current = sizes[resource.name] + + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". + if volume.storage_pool_name != self._group_name: + continue + + current_size = volume.allocated_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get allocated size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + current[volume.number] = max(current_size, current.get(volume.number) or 0) + + total_size = 0 + for volumes in sizes.itervalues(): + for size in volumes.itervalues(): + total_size += size + + return total_size * 1024 + + def get_min_physical_size(self): + """ + Give the minimum physical size of the SR. + I.e. the size of the smallest disk + the number of pools. + :return: The physical min size. + :rtype: tuple(int, int) + """ + size = None + pool_count = 0 + for pool in self._get_storage_pools(force=True): + space = pool.free_space + if space: + pool_count += 1 + current_size = space.total_capacity + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get pool total_capacity attr of `{}`' + .format(pool.node_name) + ) + if size is None or current_size < size: + size = current_size + return (pool_count, (size or 0) * 1024) + + @property + def metadata(self): + """ + Get the metadata of the SR. + :return: Dictionary that contains metadata. + :rtype: dict(str, dict) + """ + + sr_properties = self._get_sr_properties() + metadata = sr_properties.get(self.PROP_METADATA) + if metadata is not None: + metadata = json.loads(metadata) + if isinstance(metadata, dict): + return metadata + raise LinstorVolumeManagerError( + 'Expected dictionary in SR metadata: {}'.format( + self._group_name + ) + ) + + return {} + + @metadata.setter + def metadata(self, metadata): + """ + Set the metadata of the SR. + :param dict metadata: Dictionary that contains metadata. 
+ """ + + assert isinstance(metadata, dict) + sr_properties = self._get_sr_properties() + sr_properties[self.PROP_METADATA] = json.dumps(metadata) + + @property + def disconnected_hosts(self): + """ + Get the list of disconnected hosts. + :return: Set that contains disconnected hosts. + :rtype: set(str) + """ + + disconnected_hosts = set() + for pool in self._get_storage_pools(): + for report in pool.reports: + if report.ret_code & linstor.consts.WARN_NOT_CONNECTED == \ + linstor.consts.WARN_NOT_CONNECTED: + disconnected_hosts.add(pool.node_name) + break + return disconnected_hosts + + def check_volume_exists(self, volume_uuid): + """ + Check if a volume exists in the SR. + :return: True if volume exists. + :rtype: bool + """ + return volume_uuid in self._volumes + + def create_volume( + self, volume_uuid, size, persistent=True, volume_name=None + ): + """ + Create a new volume on the SR. + :param str volume_uuid: The volume uuid to use. + :param int size: volume size in B. + :param bool persistent: If false the volume will be unavailable + on the next constructor call LinstorSR(...). + :param str volume_name: If set, this name is used in the LINSTOR + database instead of a generated name. + :return: The current device path of the volume. + :rtype: str + """ + + self._logger('Creating LINSTOR volume {}...'.format(volume_uuid)) + if not volume_name: + volume_name = self.build_volume_name(util.gen_uuid()) + volume_properties = self._create_volume_with_properties( + volume_uuid, volume_name, size, place_resources=True + ) + + # Volume created! Now try to find the device path. + try: + self._logger( + 'Find device path of LINSTOR volume {}...'.format(volume_uuid) + ) + device_path = self._find_device_path(volume_uuid, volume_name) + if persistent: + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + self._volumes.add(volume_uuid) + self._logger( + 'LINSTOR volume {} created!'.format(volume_uuid) + ) + return device_path + except Exception as e: + # There is an issue to find the path. + # At this point the volume has just been created, so force flag can be used. + self._destroy_volume(volume_uuid, force=True) + raise + + def mark_volume_as_persistent(self, volume_uuid): + """ + Mark volume as persistent if created with persistent=False. + :param str volume_uuid: The volume uuid to mark. + """ + + self._ensure_volume_exists(volume_uuid) + + # Mark volume as persistent. + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + + def destroy_volume(self, volume_uuid): + """ + Destroy a volume. + :param str volume_uuid: The volume uuid to destroy. + """ + + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + + # Mark volume as destroyed. + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS + + self._volumes.remove(volume_uuid) + self._destroy_volume(volume_uuid) + + def lock_volume(self, volume_uuid, locked=True): + """ + Prevent modifications of the volume properties during + "self.LOCKED_EXPIRATION_DELAY" seconds. The SR must be locked + when used. This method is useful to attach/detach correctly a volume on + a slave. Without it the GC can rename a volume, in this case the old + volume path can be used by a slave... + :param str volume_uuid: The volume uuid to protect/unprotect. + :param bool locked: Lock/unlock the volume. 
+ """ + + self._ensure_volume_exists(volume_uuid) + + self._logger( + '{} volume {} as locked'.format( + 'Mark' if locked else 'Unmark', + volume_uuid + ) + ) + + volume_properties = self._get_volume_properties(volume_uuid) + if locked: + volume_properties[ + self.PROP_IS_READONLY_TIMESTAMP + ] = str(time.time()) + elif self.PROP_IS_READONLY_TIMESTAMP in volume_properties: + volume_properties.pop(self.PROP_IS_READONLY_TIMESTAMP) + + def ensure_volume_is_not_locked(self, volume_uuid, timeout=None): + """ + Ensure a volume is not locked. Wait if necessary. + :param str volume_uuid: The volume uuid to check. + :param int timeout: If the volume is always locked after the expiration + of the timeout, an exception is thrown. + """ + return self.ensure_volume_list_is_not_locked([volume_uuid], timeout) + + def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): + checked = set() + for volume_uuid in volume_uuids: + if volume_uuid in self._volumes: + checked.add(volume_uuid) + + if not checked: + return + + waiting = False + + volume_properties = self._get_kv_cache() + + start = time.time() + while True: + # Can't delete in for loop, use a copy of the list. + remaining = checked.copy() + for volume_uuid in checked: + volume_properties.namespace = \ + self._build_volume_namespace(volume_uuid) + timestamp = volume_properties.get( + self.PROP_IS_READONLY_TIMESTAMP + ) + if timestamp is None: + remaining.remove(volume_uuid) + continue + + now = time.time() + if now - float(timestamp) > self.LOCKED_EXPIRATION_DELAY: + self._logger( + 'Remove readonly timestamp on {}'.format(volume_uuid) + ) + volume_properties.pop(self.PROP_IS_READONLY_TIMESTAMP) + remaining.remove(volume_uuid) + continue + + if not waiting: + self._logger( + 'Volume {} is locked, waiting...'.format(volume_uuid) + ) + waiting = True + break + + if not remaining: + break + checked = remaining + + if timeout is not None and now - start > timeout: + raise LinstorVolumeManagerError( + 'volume `{}` is locked and timeout has been reached' + .format(volume_uuid), + LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS + ) + + # We must wait to use the volume. After that we can modify it + # ONLY if the SR is locked to avoid bad reads on the slaves. + time.sleep(1) + volume_properties = self._create_kv_cache() + + if waiting: + self._logger('No volume locked now!') + + def remove_volume_if_diskless(self, volume_uuid): + """ + Remove disless path from local node. + :param str volume_uuid: The volume uuid to remove. + """ + + self._ensure_volume_exists(volume_uuid) + + volume_properties = self._get_volume_properties(volume_uuid) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + + node_name = socket.gethostname() + result = self._linstor.resource_delete_if_diskless( + node_name=node_name, rsc_name=volume_name + ) + if not linstor.Linstor.all_api_responses_no_error(result): + raise LinstorVolumeManagerError( + 'Unable to delete diskless path of `{}` on node `{}`: {}' + .format(volume_name, node_name, ', '.join( + [str(x) for x in result])) + ) + + def introduce_volume(self, volume_uuid): + pass # TODO: Implement me. + + def resize_volume(self, volume_uuid, new_size): + """ + Resize a volume. + :param str volume_uuid: The volume uuid to resize. + :param int new_size: New size in B. 
+ """ + + volume_name = self.get_volume_name(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + new_size = self.round_up_volume_size(new_size) + + result = self._linstor.volume_dfn_modify( + rsc_name=volume_name, + volume_nr=0, + size=new_size / 1024 + ) + + self._mark_resource_cache_as_dirty() + + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not resize volume `{}` from SR `{}`: {}' + .format(volume_uuid, self._group_name, error_str) + ) + + def get_volume_name(self, volume_uuid): + """ + Get the name of a particular volume. + :param str volume_uuid: The volume uuid of the name to get. + :return: The volume name. + :rtype: str + """ + + self._ensure_volume_exists(volume_uuid) + volume_properties = self._get_volume_properties(volume_uuid) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + if volume_name: + return volume_name + raise LinstorVolumeManagerError( + 'Failed to get volume name of {}'.format(volume_uuid) + ) + + def get_volume_size(self, volume_uuid): + """ + Get the size of a particular volume. + :param str volume_uuid: The volume uuid of the size to get. + :return: The volume size. + :rtype: int + """ + + volume_name = self.get_volume_name(volume_uuid) + dfns = self._linstor.resource_dfn_list_raise( + query_volume_definitions=True, + filter_by_resource_definitions=[volume_name] + ).resource_definitions + + size = dfns[0].volume_definitions[0].size + if size < 0: + raise LinstorVolumeManagerError( + 'Failed to get volume size of: {}'.format(volume_uuid) + ) + return size * 1024 + + + def set_auto_promote_timeout(self, volume_uuid, timeout): + """ + Define the blocking time of open calls when a DRBD + is already open on another host. + :param str volume_uuid: The volume uuid to modify. + """ + + volume_name = self.get_volume_name(volume_uuid) + result = self._linstor.resource_dfn_modify(volume_name, { + 'DrbdOptions/Resource/auto-promote-timeout': timeout + }) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not change the auto promote timeout of `{}`: {}' + .format(volume_uuid, error_str) + ) + + def set_ping_timeout(self, volume_uuid, timeout): + """ + Set the response time to answer a DRBD ping packet. + :param str volume_uuid: The volume uuid to modify. + """ + + volume_name = self.get_volume_name(volume_uuid) + result = self._linstor.resource_dfn_modify(volume_name, { + 'DrbdOptions/Net/ping-timeout': timeout + }) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not change the ping timeout of `{}`: {}' + .format(volume_uuid, error_str) + ) + + def get_volume_info(self, volume_uuid): + """ + Get the volume info of a particular volume. + :param str volume_uuid: The volume uuid of the volume info to get. + :return: The volume info. + :rtype: VolumeInfo + """ + + volume_name = self.get_volume_name(volume_uuid) + return self._get_volumes_info()[volume_name] + + def get_device_path(self, volume_uuid): + """ + Get the dev path of a volume, create a diskless if necessary. + :param str volume_uuid: The volume uuid to get the dev path. + :return: The current device path of the volume. + :rtype: str + """ + + volume_name = self.get_volume_name(volume_uuid) + return self._find_device_path(volume_uuid, volume_name) + + def get_volume_uuid_from_device_path(self, device_path): + """ + Get the volume uuid of a device_path. + :param str device_path: The dev path to find the volume uuid. 
+ :return: The volume uuid of the local device path. + :rtype: str + """ + + expected_volume_name = \ + self.get_volume_name_from_device_path(device_path) + + volume_names = self.get_volumes_with_name() + for volume_uuid, volume_name in volume_names.items(): + if volume_name == expected_volume_name: + return volume_uuid + + raise LinstorVolumeManagerError( + 'Unable to find volume uuid from dev path `{}`'.format(device_path) + ) + + def get_volume_name_from_device_path(self, device_path): + """ + Get the volume name of a device_path. + :param str device_path: The dev path to find the volume name. + :return: The volume name of the device path. + :rtype: str + """ + + # Assume that we have a path like this: + # - "/dev/drbd/by-res/xcp-volume-/0" + # - "../xcp-volume-/0" + if device_path.startswith(DRBD_BY_RES_PATH): + prefix_len = len(DRBD_BY_RES_PATH) + else: + assert device_path.startswith('../') + prefix_len = 3 + + res_name_end = device_path.find('/', prefix_len) + assert res_name_end != -1 + return device_path[prefix_len:res_name_end] + + def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): + """ + Change the uuid of a volume. + :param str volume_uuid: The volume to modify. + :param str new_volume_uuid: The new volume uuid to use. + :param bool force: If true we doesn't check if volume_uuid is in the + volume list. I.e. the volume can be marked as deleted but the volume + can still be in the LINSTOR KV store if the deletion has failed. + In specific cases like "undo" after a failed clone we must rename a bad + deleted VDI. + """ + + assert volume_uuid != new_volume_uuid + + self._logger( + 'Trying to update volume UUID {} to {}...' + .format(volume_uuid, new_volume_uuid) + ) + if not force: + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + + if new_volume_uuid in self._volumes: + raise LinstorVolumeManagerError( + 'Volume `{}` already exists'.format(new_volume_uuid), + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + volume_properties = self._get_volume_properties(volume_uuid) + if volume_properties.get(self.PROP_UPDATING_UUID_SRC): + raise LinstorVolumeManagerError( + 'Cannot update volume uuid {}: invalid state' + .format(volume_uuid) + ) + + # 1. Copy in temp variables metadata and volume_name. + metadata = volume_properties.get(self.PROP_METADATA) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + + # 2. Switch to new volume namespace. + volume_properties.namespace = self._build_volume_namespace( + new_volume_uuid + ) + + if list(volume_properties.items()): + raise LinstorVolumeManagerError( + 'Cannot update volume uuid {} to {}: ' + .format(volume_uuid, new_volume_uuid) + + 'this last one is not empty' + ) + + try: + # 3. Mark new volume properties with PROP_UPDATING_UUID_SRC. + # If we crash after that, the new properties can be removed + # properly. + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS + volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid + + # 4. Copy the properties. + # Note: On new volumes, during clone for example, the metadata + # may be missing. So we must test it to avoid this error: + # "None has to be a str/unicode, but is " + if metadata: + volume_properties[self.PROP_METADATA] = metadata + volume_properties[self.PROP_VOLUME_NAME] = volume_name + + # 5. Ok! + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + except Exception as e: + try: + # Clear the new volume properties in case of failure. 
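get_volume_name_from_device_path() above derives the LINSTOR resource name from either an absolute '/dev/drbd/by-res/<resource>/<volume>' path or a relative '../<resource>/<volume>' symlink target. A minimal standalone sketch of that parsing rule, with an illustrative resource name in the checks:

DRBD_BY_RES_PATH = '/dev/drbd/by-res/'

def volume_name_from_path(device_path):
    if device_path.startswith(DRBD_BY_RES_PATH):
        prefix_len = len(DRBD_BY_RES_PATH)
    elif device_path.startswith('../'):
        prefix_len = 3
    else:
        raise ValueError('unexpected device path: {}'.format(device_path))
    res_name_end = device_path.find('/', prefix_len)
    if res_name_end == -1:
        raise ValueError('missing volume number in: {}'.format(device_path))
    return device_path[prefix_len:res_name_end]

assert volume_name_from_path('/dev/drbd/by-res/xcp-volume-0123/0') == 'xcp-volume-0123'
assert volume_name_from_path('../xcp-volume-0123/0') == 'xcp-volume-0123'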
+ assert volume_properties.namespace == \ + self._build_volume_namespace(new_volume_uuid) + volume_properties.clear() + except Exception as e: + self._logger( + 'Failed to clear new volume properties: {} (ignoring...)' + .format(e) + ) + raise LinstorVolumeManagerError( + 'Failed to copy volume properties: {}'.format(e) + ) + + try: + # 6. After this point, it's ok we can remove the + # PROP_UPDATING_UUID_SRC property and clear the src properties + # without problems. + + # 7. Switch to old volume namespace. + volume_properties.namespace = self._build_volume_namespace( + volume_uuid + ) + volume_properties.clear() + + # 8. Switch a last time to new volume namespace. + volume_properties.namespace = self._build_volume_namespace( + new_volume_uuid + ) + volume_properties.pop(self.PROP_UPDATING_UUID_SRC) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to clear volume properties ' + 'after volume uuid update: {}'.format(e) + ) + + self._volumes.remove(volume_uuid) + self._volumes.add(new_volume_uuid) + + self._logger( + 'UUID update succeeded of {} to {}! (properties={})' + .format( + volume_uuid, new_volume_uuid, + self._get_filtered_properties(volume_properties) + ) + ) + + def update_volume_name(self, volume_uuid, volume_name): + """ + Change the volume name of a volume. + :param str volume_uuid: The volume to modify. + :param str volume_name: The volume_name to use. + """ + + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + if not volume_name.startswith(self.PREFIX_VOLUME): + raise LinstorVolumeManagerError( + 'Volume name `{}` must be start with `{}`' + .format(volume_name, self.PREFIX_VOLUME) + ) + + if volume_name not in self._fetch_resource_names(): + raise LinstorVolumeManagerError( + 'Volume `{}` doesn\'t exist'.format(volume_name) + ) + + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_VOLUME_NAME] = volume_name + + def get_usage_states(self, volume_uuid): + """ + Check if a volume is currently used. + :param str volume_uuid: The volume uuid to check. + :return: A dictionnary that contains states. + :rtype: dict(str, bool or None) + """ + + states = {} + + volume_name = self.get_volume_name(volume_uuid) + for resource_state in self._linstor.resource_list_raise( + filter_by_resources=[volume_name] + ).resource_states: + states[resource_state.node_name] = resource_state.in_use + + return states + + def get_volume_openers(self, volume_uuid): + """ + Get openers of a volume. + :param str volume_uuid: The volume uuid to monitor. + :return: A dictionnary that contains openers. + :rtype: dict(str, obj) + """ + return get_all_volume_openers(self.get_volume_name(volume_uuid), '0') + + + def get_volumes_with_name(self): + """ + Give a volume dictionnary that contains names actually owned. + :return: A volume/name dict. + :rtype: dict(str, str) + """ + return self._get_volumes_by_property(self.REG_VOLUME_NAME) + + def get_volumes_with_info(self): + """ + Give a volume dictionnary that contains VolumeInfos. + :return: A volume/VolumeInfo dict. + :rtype: dict(str, VolumeInfo) + """ + + volumes = {} + + all_volume_info = self._get_volumes_info() + volume_names = self.get_volumes_with_name() + for volume_uuid, volume_name in volume_names.items(): + if volume_name: + volume_info = all_volume_info.get(volume_name) + if volume_info: + volumes[volume_uuid] = volume_info + continue + + # Well I suppose if this volume is not available, + # LINSTOR has been used directly without using this API. 
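The numbered steps of update_volume_uuid() implement a small crash-safe rename protocol on top of the LINSTOR KV store: the destination namespace is first marked with the source UUID, then filled, and only afterwards is the source namespace cleared. A simplified sketch of that ordering, using plain dicts and illustrative key names in place of the real PROP_* properties:

def rename_volume(kv, old_uuid, new_uuid):
    src = kv[old_uuid]
    dst = kv.setdefault(new_uuid, {})
    assert not dst, 'target namespace must be empty'

    # Mark the copy in progress, so a crash here can be repaired later.
    dst['not-exists'] = '1'
    dst['updating-uuid-src'] = old_uuid

    # Copy the properties, then flip the "exists" flag.
    for key in ('metadata', 'volume-name'):
        if key in src:
            dst[key] = src[key]
    dst['not-exists'] = '0'

    # Only now is it safe to drop the old namespace and the marker.
    src.clear()
    del kv[old_uuid]
    del dst['updating-uuid-src']

kv = {'a': {'volume-name': 'xcp-volume-1', 'not-exists': '0'}}
rename_volume(kv, 'a', 'b')
assert 'a' not in kv and kv['b']['volume-name'] == 'xcp-volume-1'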
+ volumes[volume_uuid] = self.VolumeInfo('') + + return volumes + + def get_volumes_with_metadata(self): + """ + Give a volume dictionnary that contains metadata. + :return: A volume/metadata dict. + :rtype: dict(str, dict) + """ + + volumes = {} + + metadata = self._get_volumes_by_property(self.REG_METADATA) + for volume_uuid, volume_metadata in metadata.items(): + if volume_metadata: + volume_metadata = json.loads(volume_metadata) + if isinstance(volume_metadata, dict): + volumes[volume_uuid] = volume_metadata + continue + raise LinstorVolumeManagerError( + 'Expected dictionary in volume metadata: {}' + .format(volume_uuid) + ) + + volumes[volume_uuid] = {} + + return volumes + + def get_volume_metadata(self, volume_uuid): + """ + Get the metadata of a volume. + :return: Dictionary that contains metadata. + :rtype: dict + """ + + self._ensure_volume_exists(volume_uuid) + volume_properties = self._get_volume_properties(volume_uuid) + metadata = volume_properties.get(self.PROP_METADATA) + if metadata: + metadata = json.loads(metadata) + if isinstance(metadata, dict): + return metadata + raise LinstorVolumeManagerError( + 'Expected dictionary in volume metadata: {}' + .format(volume_uuid) + ) + return {} + + def set_volume_metadata(self, volume_uuid, metadata): + """ + Set the metadata of a volume. + :param dict metadata: Dictionary that contains metadata. + """ + + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + + assert isinstance(metadata, dict) + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_METADATA] = json.dumps(metadata) + + def update_volume_metadata(self, volume_uuid, metadata): + """ + Update the metadata of a volume. It modify only the given keys. + It doesn't remove unreferenced key instead of set_volume_metadata. + :param dict metadata: Dictionary that contains metadata. + """ + + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + + assert isinstance(metadata, dict) + volume_properties = self._get_volume_properties(volume_uuid) + + current_metadata = json.loads( + volume_properties.get(self.PROP_METADATA, '{}') + ) + if not isinstance(metadata, dict): + raise LinstorVolumeManagerError( + 'Expected dictionary in volume metadata: {}' + .format(volume_uuid) + ) + + for key, value in metadata.items(): + current_metadata[key] = value + volume_properties[self.PROP_METADATA] = json.dumps(current_metadata) + + def shallow_clone_volume(self, volume_uuid, clone_uuid, persistent=True): + """ + Clone a volume. Do not copy the data, this method creates a new volume + with the same size. It tries to create the volume on the same host + than volume source. + :param str volume_uuid: The volume to clone. + :param str clone_uuid: The cloned volume. + :param bool persistent: If false the volume will be unavailable + on the next constructor call LinstorSR(...). + :return: The current device path of the cloned volume. + :rtype: str + """ + + volume_name = self.get_volume_name(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + + # 1. Find ideal nodes + size to use. + ideal_node_names, size = self._get_volume_node_names_and_size( + volume_name + ) + if size <= 0: + raise LinstorVolumeManagerError( + 'Invalid size of {} for volume `{}`'.format(size, volume_name) + ) + + # 2. Find the node(s) with the maximum space. 
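set_volume_metadata() and update_volume_metadata() above differ only in merge behaviour: the former replaces the JSON blob stored in the volume properties, the latter merges the given keys into it. A short sketch with a plain dict standing in for the KV-backed properties:

import json

def set_metadata(volume_properties, metadata):
    volume_properties['metadata'] = json.dumps(metadata)

def update_metadata(volume_properties, metadata):
    current = json.loads(volume_properties.get('metadata', '{}'))
    current.update(metadata)
    volume_properties['metadata'] = json.dumps(current)

props = {'metadata': json.dumps({'name_label': 'disk-0', 'read_only': False})}

update_metadata(props, {'read_only': True})
assert json.loads(props['metadata'])['name_label'] == 'disk-0'  # key preserved

set_metadata(props, {'read_only': True})
assert 'name_label' not in json.loads(props['metadata'])  # whole blob replaced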
+ candidates = self._find_best_size_candidates() + if not candidates: + raise LinstorVolumeManagerError( + 'Unable to shallow clone volume `{}`, no free space found.' + ) + + # 3. Compute node names and search if we can try to clone + # on the same nodes than volume. + def find_best_nodes(): + for candidate in candidates: + for node_name in candidate.node_names: + if node_name in ideal_node_names: + return candidate.node_names + + node_names = find_best_nodes() + if not node_names: + node_names = candidates[0].node_names + + if len(node_names) < self._redundancy: + raise LinstorVolumeManagerError( + 'Unable to shallow clone volume `{}`, '.format(volume_uuid) + + '{} are required to clone, found: {}'.format( + self._redundancy, len(node_names) + ) + ) + + # 4. Compute resources to create. + clone_volume_name = self.build_volume_name(util.gen_uuid()) + diskless_node_names = self._get_node_names() + resources = [] + for node_name in node_names: + diskless_node_names.remove(node_name) + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=clone_volume_name, + storage_pool=self._group_name + )) + + # 5. Create resources! + def clean(): + try: + self._destroy_volume(clone_uuid, force=True) + except Exception as e: + self._logger( + 'Unable to destroy volume {} after shallow clone fail: {}' + .format(clone_uuid, e) + ) + + def create(): + # Note: placed outside try/except block because we create only definition first. + # There is no reason to call `clean` before the real resource creation. + volume_properties = self._create_volume_with_properties( + clone_uuid, clone_volume_name, size, place_resources=False + ) + + # After this point, `clean` can be called for any fail because the clone UUID + # is really unique. No risk to remove existing data. + try: + result = self._linstor.resource_create(resources) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create cloned volume `{}` of `{}` from ' + 'SR `{}`: {}'.format( + clone_uuid, volume_uuid, self._group_name, + error_str + ) + ) + return volume_properties + except Exception: + clean() + raise + + # Retry because we can get errors like this: + # "Resource disappeared while waiting for it to be ready" or + # "Resource did not became ready on node 'XXX' within reasonable time, check Satellite for errors." + # in the LINSTOR server. + volume_properties = util.retry(create, maxretry=5) + + try: + device_path = self._find_device_path(clone_uuid, clone_volume_name) + if persistent: + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + self._volumes.add(clone_uuid) + return device_path + except Exception as e: + clean() + raise + + def remove_resourceless_volumes(self): + """ + Remove all volumes without valid or non-empty name + (i.e. without LINSTOR resource). It's different than + LinstorVolumeManager constructor that takes a `repair` param that + removes volumes with `PROP_NOT_EXISTS` to 1. + """ + + resource_names = self._fetch_resource_names() + for volume_uuid, volume_name in self.get_volumes_with_name().items(): + if not volume_name or volume_name not in resource_names: + # Don't force, we can be sure of what's happening. + self.destroy_volume(volume_uuid) + + def destroy(self): + """ + Destroy this SR. Object should not be used after that. + :param bool force: Try to destroy volumes before if true. 
+ """ + + if self._volumes: + raise LinstorVolumeManagerError( + 'Cannot destroy LINSTOR volume manager: ' + 'It exists remaining volumes' + ) + + controller_is_running = self._controller_is_running() + uri = 'linstor://localhost' + try: + if controller_is_running: + self._start_controller(start=False) + + # 1. Umount LINSTOR database. + self._mount_database_volume( + self.build_device_path(DATABASE_VOLUME_NAME), + mount=False, + force=True + ) + + # 2. Refresh instance. + self._start_controller(start=True) + self._linstor = self._create_linstor_instance( + uri, keep_uri_unmodified=True + ) + + # 3. Destroy database volume. + self._destroy_resource(DATABASE_VOLUME_NAME) + + # 4. Destroy group and storage pools. + self._destroy_resource_group(self._linstor, self._group_name) + for pool in self._get_storage_pools(force=True): + self._destroy_storage_pool( + self._linstor, pool.name, pool.node_name + ) + except Exception as e: + self._start_controller(start=controller_is_running) + raise e + + try: + self._start_controller(start=False) + for file in glob.glob(DATABASE_PATH + '/'): + os.remove(file) + except Exception as e: + util.SMlog( + 'Ignoring failure after LINSTOR SR destruction: {}' + .format(e) + ) + + def find_up_to_date_diskful_nodes(self, volume_uuid): + """ + Find all nodes that contain a specific volume using diskful disks. + The disk must be up to data to be used. + :param str volume_uuid: The volume to use. + :return: The available nodes. + :rtype: tuple(set(str), str) + """ + + volume_name = self.get_volume_name(volume_uuid) + + in_use_by = None + node_names = set() + + resource_states = filter( + lambda resource_state: resource_state.name == volume_name, + self._get_resource_cache().resource_states + ) + + for resource_state in resource_states: + volume_state = resource_state.volume_states[0] + if volume_state.disk_state == 'UpToDate': + node_names.add(resource_state.node_name) + if resource_state.in_use: + in_use_by = resource_state.node_name + + return (node_names, in_use_by) + + def invalidate_resource_cache(self): + """ + If resources are impacted by external commands like vhdutil, + it's necessary to call this function to invalidate current resource + cache. + """ + self._mark_resource_cache_as_dirty() + + def has_node(self, node_name): + """ + Check if a node exists in the LINSTOR database. + :rtype: bool + """ + result = self._linstor.node_list() + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to list nodes using `{}`: {}' + .format(node_name, error_str) + ) + return bool(result[0].node(node_name)) + + def create_node(self, node_name, ip): + """ + Create a new node in the LINSTOR database. + :param str node_name: Node name to use. + :param str ip: Host IP to communicate. + """ + result = self._linstor.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip + ) + errors = self._filter_errors(result) + if errors: + error_str = self._get_error_str(errors) + raise LinstorVolumeManagerError( + 'Failed to create node `{}`: {}'.format(node_name, error_str) + ) + + def destroy_node(self, node_name): + """ + Destroy a node in the LINSTOR database. + :param str node_name: Node name to remove. 
+ """ + result = self._linstor.node_delete(node_name) + errors = self._filter_errors(result) + if errors: + error_str = self._get_error_str(errors) + raise LinstorVolumeManagerError( + 'Failed to destroy node `{}`: {}'.format(node_name, error_str) + ) + + def get_nodes_info(self): + """ + Get all nodes + statuses, used or not by the pool. + :rtype: dict(str, dict) + """ + try: + nodes = {} + for node in self._linstor.node_list_raise().nodes: + nodes[node.name] = node.connection_status + return nodes + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to get all nodes: `{}`'.format(e) + ) + + def get_storage_pools_info(self): + """ + Give all storage pools of current group name. + :rtype: dict(str, list) + """ + storage_pools = {} + for pool in self._get_storage_pools(force=True): + if pool.node_name not in storage_pools: + storage_pools[pool.node_name] = [] + + size = -1 + capacity = -1 + + space = pool.free_space + if space: + size = space.free_capacity + if size < 0: + size = -1 + else: + size *= 1024 + capacity = space.total_capacity + if capacity <= 0: + capacity = -1 + else: + capacity *= 1024 + + storage_pools[pool.node_name].append({ + 'storage-pool-name': pool.name, + 'uuid': pool.uuid, + 'free-size': size, + 'capacity': capacity + }) + + return storage_pools + + def get_resources_info(self): + """ + Give all resources of current group name. + :rtype: dict(str, list) + """ + resources = {} + resource_list = self._linstor.resource_list_raise() + for resource in resource_list.resources: + if resource.name not in resources: + resources[resource.name] = {} + + resources[resource.name][resource.node_name] = { + 'volumes': [], + 'diskful': linstor.consts.FLAG_DISKLESS not in resource.flags, + 'tie-breaker': linstor.consts.FLAG_TIE_BREAKER in resource.flags + } + + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". + if volume.storage_pool_name != self._group_name: + continue + + usable_size = volume.usable_size + if usable_size < 0: + usable_size = -1 + else: + usable_size *= 1024 + + allocated_size = volume.allocated_size + if allocated_size < 0: + allocated_size = -1 + else: + allocated_size *= 1024 + + resources[resource.name][resource.node_name]['volumes'].append({ + 'storage-pool-name': volume.storage_pool_name, + 'uuid': volume.uuid, + 'number': volume.number, + 'device-path': volume.device_path, + 'usable-size': usable_size, + 'allocated-size': allocated_size + }) + + for resource_state in resource_list.resource_states: + resource = resources[resource_state.rsc_name][resource_state.node_name] + resource['in-use'] = resource_state.in_use + + volumes = resource['volumes'] + for volume_state in resource_state.volume_states: + volume = next((x for x in volumes if x['number'] == volume_state.number), None) + if volume: + volume['disk-state'] = volume_state.disk_state + + return resources + + def get_database_path(self): + """ + Get the database path. + :return: The current database path. + :rtype: str + """ + return self._request_database_path(self._linstor) + + @classmethod + def create_sr( + cls, group_name, ips, redundancy, + thin_provisioning, auto_quorum, + logger=default_logger.__func__ + ): + """ + Create a new SR on the given nodes. + :param str group_name: The SR group_name to use. + :param set(str) ips: Node ips. + :param int redundancy: How many copy of volumes should we store? + :param bool thin_provisioning: Use thin or thick provisioning. + :param bool auto_quorum: DB quorum is monitored by LINSTOR. 
+ :param function logger: Function to log messages. + :return: A new LinstorSr instance. + :rtype: LinstorSr + """ + + try: + cls._start_controller(start=True) + sr = cls._create_sr( + group_name, + ips, + redundancy, + thin_provisioning, + auto_quorum, + logger + ) + finally: + # Controller must be stopped and volume unmounted because + # it is the role of the drbd-reactor daemon to do the right + # actions. + cls._start_controller(start=False) + cls._mount_volume( + cls.build_device_path(DATABASE_VOLUME_NAME), + DATABASE_PATH, + mount=False + ) + return sr + + @classmethod + def _create_sr( + cls, group_name, ips, redundancy, + thin_provisioning, auto_quorum, + logger=default_logger.__func__ + ): + # 1. Check if SR already exists. + uri = 'linstor://localhost' + + lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) + + node_names = ips.keys() + for node_name, ip in ips.iteritems(): + while True: + # Try to create node. + result = lin.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip + ) + + errors = cls._filter_errors(result) + if cls._check_errors( + errors, [linstor.consts.FAIL_EXISTS_NODE] + ): + # If it already exists, remove, then recreate. + result = lin.node_delete(node_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to remove old node `{}`: {}' + .format(node_name, error_str) + ) + elif not errors: + break # Created! + else: + raise LinstorVolumeManagerError( + 'Failed to create node `{}` with ip `{}`: {}'.format( + node_name, ip, cls._get_error_str(errors) + ) + ) + + driver_pool_name = group_name + base_group_name = group_name + group_name = cls._build_group_name(group_name) + pools = lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) + pools = pools.storage_pools + if pools: + existing_node_names = map(lambda pool: pool.node_name, pools) + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`. It already exists on node(s): {}' + .format(group_name, existing_node_names) + ) + + if lin.resource_group_list_raise( + [group_name] + ).resource_groups: + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`: The group name already exists' + .format(group_name) + ) + + if thin_provisioning: + driver_pool_parts = driver_pool_name.split('/') + if not len(driver_pool_parts) == 2: + raise LinstorVolumeManagerError( + 'Invalid group name using thin provisioning. ' + 'Expected format: \'VG/LV`\'' + ) + + # 2. Create storage pool on each node + resource group. + reg_volume_group_not_found = re.compile( + ".*Volume group '.*' not found$" + ) + + i = 0 + try: + # 2.a. Create storage pools. + storage_pool_count = 0 + while i < len(node_names): + node_name = node_names[i] + + result = lin.storage_pool_create( + node_name=node_name, + storage_pool_name=group_name, + storage_driver='LVM_THIN' if thin_provisioning else 'LVM', + driver_pool_name=driver_pool_name + ) + + errors = linstor.Linstor.filter_api_call_response_errors( + result + ) + if errors: + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_STOR_POOL_CONFIGURATION_ERROR + ) and reg_volume_group_not_found.match(errors[0].message): + logger( + 'Volume group `{}` not found on `{}`. Ignoring...' 
+ .format(group_name, node_name) + ) + cls._destroy_storage_pool(lin, group_name, node_name) + else: + error_str = cls._get_error_str(result) + raise LinstorVolumeManagerError( + 'Could not create SP `{}` on node `{}`: {}' + .format(group_name, node_name, error_str) + ) + else: + storage_pool_count += 1 + i += 1 + + if not storage_pool_count: + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`: No VG group found'.format( + group_name, + ) + ) + + # 2.b. Create resource group. + result = lin.resource_group_create( + name=group_name, + place_count=redundancy, + storage_pool=group_name, + diskless_on_remaining=False + ) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create RG `{}`: {}'.format( + group_name, error_str + ) + ) + + # 2.c. Create volume group. + result = lin.volume_group_create(group_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create VG `{}`: {}'.format( + group_name, error_str + ) + ) + + # 3. Create the LINSTOR database volume and mount it. + try: + logger('Creating database volume...') + volume_path = cls._create_database_volume( + lin, group_name, node_names, redundancy, auto_quorum + ) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + logger('Destroying database volume after creation fail...') + cls._force_destroy_database_volume(lin, group_name) + raise + + try: + logger('Mounting database volume...') + + # First we must disable the controller to move safely the + # LINSTOR config. + cls._start_controller(start=False) + + cls._mount_database_volume(volume_path) + except Exception as e: + # Ensure we are connected because controller has been + # restarted during mount call. + logger('Destroying database volume after mount fail...') + + try: + cls._start_controller(start=True) + except Exception: + pass + + lin = cls._create_linstor_instance( + uri, keep_uri_unmodified=True + ) + cls._force_destroy_database_volume(lin, group_name) + raise e + + cls._start_controller(start=True) + lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) + + # 4. Remove storage pools/resource/volume group in the case of errors. + except Exception as e: + logger('Destroying resource group and storage pools after fail...') + try: + cls._destroy_resource_group(lin, group_name) + except Exception as e2: + logger('Failed to destroy resource group: {}'.format(e2)) + pass + j = 0 + i = min(i, len(node_names) - 1) + while j <= i: + try: + cls._destroy_storage_pool(lin, group_name, node_names[j]) + except Exception as e2: + logger('Failed to destroy resource group: {}'.format(e2)) + pass + j += 1 + raise e + + # 5. Return new instance. + instance = cls.__new__(cls) + instance._linstor = lin + instance._logger = logger + instance._redundancy = redundancy + instance._base_group_name = base_group_name + instance._group_name = group_name + instance._volumes = set() + instance._storage_pools_time = 0 + instance._kv_cache = instance._create_kv_cache() + instance._resource_cache = None + instance._resource_cache_dirty = True + instance._volume_info_cache = None + instance._volume_info_cache_dirty = True + return instance + + @classmethod + def build_device_path(cls, volume_name): + """ + Build a device path given a volume name. + :param str volume_name: The volume name to use. + :return: A valid or not device path. 
+ :rtype: str + """ + + return '{}{}/0'.format(cls.DEV_ROOT_PATH, volume_name) + + @classmethod + def build_volume_name(cls, base_name): + """ + Build a volume name given a base name (i.e. a UUID). + :param str volume_name: The volume name to use. + :return: A valid or not device path. + :rtype: str + """ + return '{}{}'.format(cls.PREFIX_VOLUME, base_name) + + @classmethod + def round_up_volume_size(cls, volume_size): + """ + Align volume size on higher multiple of BLOCK_SIZE. + :param int volume_size: The volume size to align. + :return: An aligned volume size. + :rtype: int + """ + return round_up(volume_size, cls.BLOCK_SIZE) + + @classmethod + def round_down_volume_size(cls, volume_size): + """ + Align volume size on lower multiple of BLOCK_SIZE. + :param int volume_size: The volume size to align. + :return: An aligned volume size. + :rtype: int + """ + return round_down(volume_size, cls.BLOCK_SIZE) + + # -------------------------------------------------------------------------- + # Private helpers. + # -------------------------------------------------------------------------- + + def _create_kv_cache(self): + self._kv_cache = self._create_linstor_kv('/') + self._kv_cache_dirty = False + return self._kv_cache + + def _get_kv_cache(self): + if self._kv_cache_dirty: + self._kv_cache = self._create_kv_cache() + return self._kv_cache + + def _create_resource_cache(self): + self._resource_cache = self._linstor.resource_list_raise() + self._resource_cache_dirty = False + return self._resource_cache + + def _get_resource_cache(self): + if self._resource_cache_dirty: + self._resource_cache = self._create_resource_cache() + return self._resource_cache + + def _mark_resource_cache_as_dirty(self): + self._resource_cache_dirty = True + self._volume_info_cache_dirty = True + + # -------------------------------------------------------------------------- + + def _ensure_volume_exists(self, volume_uuid): + if volume_uuid not in self._volumes: + raise LinstorVolumeManagerError( + 'volume `{}` doesn\'t exist'.format(volume_uuid), + LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS + ) + + def _find_best_size_candidates(self): + result = self._linstor.resource_group_qmvs(self._group_name) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to get max volume size allowed of SR `{}`: {}'.format( + self._group_name, + error_str + ) + ) + return result[0].candidates + + def _fetch_resource_names(self): + resource_names = set() + dfns = self._linstor.resource_dfn_list_raise().resource_definitions + for dfn in dfns: + if dfn.resource_group_name == self._group_name and \ + linstor.consts.FLAG_DELETE not in dfn.flags: + resource_names.add(dfn.name) + return resource_names + + def _get_volumes_info(self, volume_name=None): + all_volume_info = {} + + if not self._volume_info_cache_dirty: + return self._volume_info_cache + + for resource in self._get_resource_cache().resources: + if resource.name not in all_volume_info: + current = all_volume_info[resource.name] = self.VolumeInfo( + resource.name + ) + else: + current = all_volume_info[resource.name] + + if linstor.consts.FLAG_DISKLESS not in resource.flags: + current.diskful.append(resource.node_name) + + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". 
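round_up_volume_size() and round_down_volume_size() above align sizes on BLOCK_SIZE. A standalone sketch of that alignment, with an assumed 4 MiB block size (the real constant is defined on the class and may differ):

BLOCK_SIZE = 4 * 1024 * 1024  # assumed value for the example

def round_up(value, block):
    return ((value + block - 1) // block) * block

def round_down(value, block):
    return (value // block) * block

assert round_up(1, BLOCK_SIZE) == BLOCK_SIZE
assert round_up(BLOCK_SIZE, BLOCK_SIZE) == BLOCK_SIZE
assert round_down(BLOCK_SIZE + 1, BLOCK_SIZE) == BLOCK_SIZE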
+ if volume.storage_pool_name == self._group_name: + if volume.allocated_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get allocated size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + allocated_size = volume.allocated_size + + current.allocated_size = current.allocated_size and \ + max(current.allocated_size, allocated_size) or \ + allocated_size + + usable_size = volume.usable_size + if usable_size > 0 and ( + usable_size < current.virtual_size or + not current.virtual_size + ): + current.virtual_size = usable_size + + if current.virtual_size <= 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + + for current in all_volume_info.values(): + current.allocated_size *= 1024 + current.virtual_size *= 1024 + + self._volume_info_cache_dirty = False + self._volume_info_cache = all_volume_info + + return all_volume_info + + def _get_volume_node_names_and_size(self, volume_name): + node_names = set() + size = -1 + for resource in self._linstor.resource_list_raise( + filter_by_resources=[volume_name] + ).resources: + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". + if volume.storage_pool_name == self._group_name: + node_names.add(resource.node_name) + + current_size = volume.usable_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + + if size < 0: + size = current_size + else: + size = min(size, current_size) + + return (node_names, size * 1024) + + def _compute_size(self, attr): + capacity = 0 + for pool in self._get_storage_pools(force=True): + space = pool.free_space + if space: + size = getattr(space, attr) + if size < 0: + raise LinstorVolumeManagerError( + 'Failed to get pool {} attr of `{}`' + .format(attr, pool.node_name) + ) + capacity += size + return capacity * 1024 + + def _get_node_names(self): + node_names = set() + for pool in self._get_storage_pools(): + node_names.add(pool.node_name) + return node_names + + def _get_storage_pools(self, force=False): + cur_time = time.time() + elsaped_time = cur_time - self._storage_pools_time + + if force or elsaped_time >= self.STORAGE_POOLS_FETCH_INTERVAL: + self._storage_pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + self._storage_pools_time = time.time() + + return self._storage_pools + + def _create_volume( + self, volume_uuid, volume_name, size, place_resources + ): + size = self.round_up_volume_size(size) + self._mark_resource_cache_as_dirty() + + def create_definition(): + self._check_volume_creation_errors( + self._linstor.resource_group_spawn( + rsc_grp_name=self._group_name, + rsc_dfn_name=volume_name, + vlm_sizes=['{}B'.format(size)], + definitions_only=True + ), + volume_uuid, + self._group_name + ) + self._configure_volume_peer_slots(self._linstor, volume_name) + + def clean(): + try: + self._destroy_volume(volume_uuid, force=True) + except Exception as e: + self._logger( + 'Unable to destroy volume {} after creation fail: {}' + .format(volume_uuid, e) + ) + + def create(): + try: + create_definition() + if place_resources: + # Basic case when we use the default redundancy of the group. 
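_get_storage_pools() above caches the storage pool list and refreshes it only when forced or when STORAGE_POOLS_FETCH_INTERVAL has elapsed. A minimal sketch of that time-based cache pattern; the interval value and the fetch callable are illustrative:

import time

STORAGE_POOLS_FETCH_INTERVAL = 10  # seconds, illustrative value

class PoolCache(object):
    def __init__(self, fetch):
        self._fetch = fetch  # callable performing the expensive LINSTOR query
        self._pools = None
        self._last_fetch = 0

    def get(self, force=False):
        elapsed = time.time() - self._last_fetch
        if force or elapsed >= STORAGE_POOLS_FETCH_INTERVAL:
            self._pools = self._fetch()
            self._last_fetch = time.time()
        return self._pools

cache = PoolCache(lambda: ['pool@host-1', 'pool@host-2'])
assert cache.get() is cache.get()  # second call is served from the cache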
+ self._check_volume_creation_errors( + self._linstor.resource_auto_place( + rsc_name=volume_name, + place_count=self._redundancy, + diskless_on_remaining=False + ), + volume_uuid, + self._group_name + ) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + clean() + raise + except Exception: + clean() + raise + + util.retry(create, maxretry=5) + + def _create_volume_with_properties( + self, volume_uuid, volume_name, size, place_resources + ): + if self.check_volume_exists(volume_uuid): + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, it already exists' + .format(volume_uuid, self._group_name) + ' in properties', + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + if volume_name in self._fetch_resource_names(): + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, '.format( + volume_uuid, self._group_name + ) + 'resource of the same name already exists in LINSTOR' + ) + + # I am paranoid. + volume_properties = self._get_volume_properties(volume_uuid) + if (volume_properties.get(self.PROP_NOT_EXISTS) is not None): + raise LinstorVolumeManagerError( + 'Could not create volume `{}`, '.format(volume_uuid) + + 'properties already exist' + ) + + try: + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_CREATING + volume_properties[self.PROP_VOLUME_NAME] = volume_name + + self._create_volume( + volume_uuid, volume_name, size, place_resources + ) + + assert volume_properties.namespace == \ + self._build_volume_namespace(volume_uuid) + return volume_properties + except LinstorVolumeManagerError as e: + # Do not destroy existing resource! + # In theory we can't get this error because we check this event + # before the `self._create_volume` case. + # It can only happen if the same volume uuid is used in the same + # call in another host. + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + self._destroy_volume(volume_uuid, force=True) + raise + + def _find_device_path(self, volume_uuid, volume_name): + current_device_path = self._request_device_path( + volume_uuid, volume_name, activate=True + ) + + # We use realpath here to get the /dev/drbd path instead of + # /dev/drbd/by-res/. + expected_device_path = self.build_device_path(volume_name) + util.wait_for_path(expected_device_path, 5) + + device_realpath = os.path.realpath(expected_device_path) + if current_device_path != device_realpath: + raise LinstorVolumeManagerError( + 'Invalid path, current={}, expected={} (realpath={})' + .format( + current_device_path, + expected_device_path, + device_realpath + ) + ) + return expected_device_path + + def _request_device_path(self, volume_uuid, volume_name, activate=False): + node_name = socket.gethostname() + + resources = filter( + lambda resource: resource.node_name == node_name and + resource.name == volume_name, + self._get_resource_cache().resources + ) + + if not resources: + if activate: + self._mark_resource_cache_as_dirty() + self._activate_device_path( + self._linstor, node_name, volume_name + ) + return self._request_device_path(volume_uuid, volume_name) + raise LinstorVolumeManagerError( + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(volume_uuid) + ) + # Contains a path of the /dev/drbd form. 
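_create_volume() and shallow_clone_volume() both wrap resource creation in a clean-on-failure helper and retry the whole operation a bounded number of times (util.retry(create, maxretry=5)). A simplified sketch of that pattern; it ignores the ERR_VOLUME_EXISTS special case and uses a bare stand-in for util.retry:

def retry(fn, maxretry=5):
    # Simplified stand-in for util.retry().
    last_error = None
    for _ in range(maxretry):
        try:
            return fn()
        except Exception as e:
            last_error = e
    raise last_error

def create_with_cleanup(create, clean, maxretry=5):
    def attempt():
        try:
            return create()
        except Exception:
            clean()  # never leave a half-created resource behind
            raise
    return retry(attempt, maxretry)

calls = {'count': 0}

def flaky_create():
    calls['count'] += 1
    if calls['count'] < 3:
        raise RuntimeError('Resource disappeared while waiting for it to be ready')
    return 'created'

assert create_with_cleanup(flaky_create, clean=lambda: None) == 'created'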
+ return resources[0].volumes[0].device_path + + def _destroy_resource(self, resource_name, force=False): + result = self._linstor.resource_dfn_delete(resource_name) + error_str = self._get_error_str(result) + if not error_str: + self._mark_resource_cache_as_dirty() + return + + if not force: + self._mark_resource_cache_as_dirty() + raise LinstorVolumeManagerError( + 'Could not destroy resource `{}` from SR `{}`: {}' + .format(resource_name, self._group_name, error_str) + ) + + # If force is used, ensure there is no opener. + all_openers = get_all_volume_openers(resource_name, '0') + for openers in all_openers.itervalues(): + if openers: + self._mark_resource_cache_as_dirty() + raise LinstorVolumeManagerError( + 'Could not force destroy resource `{}` from SR `{}`: {} (openers=`{}`)' + .format(resource_name, self._group_name, error_str, all_openers) + ) + + # Maybe the resource is blocked in primary mode. DRBD/LINSTOR issue? + resource_states = filter( + lambda resource_state: resource_state.name == resource_name, + self._get_resource_cache().resource_states + ) + + # Mark only after computation of states. + self._mark_resource_cache_as_dirty() + + for resource_state in resource_states: + volume_state = resource_state.volume_states[0] + if resource_state.in_use: + demote_drbd_resource(resource_state.node_name, resource_name) + break + self._destroy_resource(resource_name) + + def _destroy_volume(self, volume_uuid, force=False): + volume_properties = self._get_volume_properties(volume_uuid) + try: + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + if volume_name in self._fetch_resource_names(): + self._destroy_resource(volume_name, force) + + # Assume this call is atomic. + volume_properties.clear() + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot destroy volume `{}`: {}'.format(volume_uuid, e) + ) + + def _build_volumes(self, repair): + properties = self._kv_cache + resource_names = self._fetch_resource_names() + + self._volumes = set() + + updating_uuid_volumes = self._get_volumes_by_property( + self.REG_UPDATING_UUID_SRC, ignore_inexisting_volumes=False + ) + if updating_uuid_volumes and not repair: + raise LinstorVolumeManagerError( + 'Cannot build LINSTOR volume list: ' + 'It exists invalid "updating uuid volumes", repair is required' + ) + + existing_volumes = self._get_volumes_by_property( + self.REG_NOT_EXISTS, ignore_inexisting_volumes=False + ) + for volume_uuid, not_exists in existing_volumes.items(): + properties.namespace = self._build_volume_namespace(volume_uuid) + + src_uuid = properties.get(self.PROP_UPDATING_UUID_SRC) + if src_uuid: + self._logger( + 'Ignoring volume during manager initialization with prop ' + ' PROP_UPDATING_UUID_SRC: {} (properties={})' + .format( + volume_uuid, + self._get_filtered_properties(properties) + ) + ) + continue + + # Insert volume in list if the volume exists. Or if the volume + # is being created and a slave wants to use it (repair = False). + # + # If we are on the master and if repair is True and state is + # Creating, it's probably a bug or crash: the creation process has + # been stopped. + if not_exists == self.STATE_EXISTS or ( + not repair and not_exists == self.STATE_CREATING + ): + self._volumes.add(volume_uuid) + continue + + if not repair: + self._logger( + 'Ignoring bad volume during manager initialization: {} ' + '(properties={})'.format( + volume_uuid, + self._get_filtered_properties(properties) + ) + ) + continue + + # Remove bad volume. 
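_build_volumes() above decides, for every volume entry found in the KV store, whether to keep it, ignore it or remove it, depending on its creation state and on whether the manager runs in repair mode (master only). A sketch of that decision; the STATE_* values are assumptions, only the ordering of the checks mirrors the code:

STATE_EXISTS, STATE_NOT_EXISTS, STATE_CREATING = '0', '1', '2'  # assumed values

def classify(not_exists, repair):
    if not_exists == STATE_EXISTS or (
        not repair and not_exists == STATE_CREATING
    ):
        return 'keep'    # usable, or still being created by another host
    if not repair:
        return 'ignore'  # slaves never clean up, they just skip the entry
    return 'remove'      # repair mode destroys or renames the bad volume

assert classify(STATE_EXISTS, repair=False) == 'keep'
assert classify(STATE_CREATING, repair=True) == 'remove'
assert classify(STATE_NOT_EXISTS, repair=False) == 'ignore'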
+ try: + self._logger( + 'Removing bad volume during manager initialization: {} ' + '(properties={})'.format( + volume_uuid, + self._get_filtered_properties(properties) + ) + ) + volume_name = properties.get(self.PROP_VOLUME_NAME) + + # Little optimization, don't call `self._destroy_volume`, + # we already have resource name list. + if volume_name in resource_names: + self._destroy_resource(volume_name, force=True) + + # Assume this call is atomic. + properties.clear() + except Exception as e: + # Do not raise, we don't want to block user action. + self._logger( + 'Cannot clean volume {}: {}'.format(volume_uuid, e) + ) + + # The volume can't be removed, maybe it's still in use, + # in this case rename it with the "DELETED_" prefix. + # This prefix is mandatory if it exists a snap transaction to + # rollback because the original VDI UUID can try to be renamed + # with the UUID we are trying to delete... + if not volume_uuid.startswith('DELETED_'): + self.update_volume_uuid( + volume_uuid, 'DELETED_' + volume_uuid, force=True + ) + + for dest_uuid, src_uuid in updating_uuid_volumes.items(): + dest_namespace = self._build_volume_namespace(dest_uuid) + + properties.namespace = dest_namespace + if int(properties.get(self.PROP_NOT_EXISTS)): + properties.clear() + continue + + properties.namespace = self._build_volume_namespace(src_uuid) + properties.clear() + + properties.namespace = dest_namespace + properties.pop(self.PROP_UPDATING_UUID_SRC) + + if src_uuid in self._volumes: + self._volumes.remove(src_uuid) + self._volumes.add(dest_uuid) + + def _get_sr_properties(self): + return self._create_linstor_kv(self._build_sr_namespace()) + + def _get_volumes_by_property( + self, reg_prop, ignore_inexisting_volumes=True + ): + base_properties = self._get_kv_cache() + base_properties.namespace = self._build_volume_namespace() + + volume_properties = {} + for volume_uuid in self._volumes: + volume_properties[volume_uuid] = '' + + for key, value in base_properties.items(): + res = reg_prop.match(key) + if res: + volume_uuid = res.groups()[0] + if not ignore_inexisting_volumes or \ + volume_uuid in self._volumes: + volume_properties[volume_uuid] = value + + return volume_properties + + def _create_linstor_kv(self, namespace): + return linstor.KV( + self._group_name, + uri=self._linstor.controller_host(), + namespace=namespace + ) + + def _get_volume_properties(self, volume_uuid): + properties = self._get_kv_cache() + properties.namespace = self._build_volume_namespace(volume_uuid) + return properties + + @classmethod + def _build_sr_namespace(cls): + return '/{}/'.format(cls.NAMESPACE_SR) + + @classmethod + def _build_volume_namespace(cls, volume_uuid=None): + # Return a path to all volumes if `volume_uuid` is not given. + if volume_uuid is None: + return '/{}/'.format(cls.NAMESPACE_VOLUME) + return '/{}/{}/'.format(cls.NAMESPACE_VOLUME, volume_uuid) + + @classmethod + def _get_error_str(cls, result): + return ', '.join([ + err.message for err in cls._filter_errors(result) + ]) + + @classmethod + def _create_linstor_instance( + cls, uri, keep_uri_unmodified=False, attempt_count=30 + ): + retry = False + + def connect(uri): + if not uri: + uri = get_controller_uri() + if not uri: + raise LinstorVolumeManagerError( + 'Unable to find controller uri...' 
+ ) + instance = linstor.Linstor(uri, keep_alive=True) + instance.connect() + return instance + + try: + return connect(uri) + except (linstor.errors.LinstorNetworkError, LinstorVolumeManagerError): + pass + + if not keep_uri_unmodified: + uri = None + + return util.retry( + lambda: connect(uri), + maxretry=attempt_count, + period=1, + exceptions=[ + linstor.errors.LinstorNetworkError, + LinstorVolumeManagerError + ] + ) + + @classmethod + def _configure_volume_peer_slots(cls, lin, volume_name): + result = lin.resource_dfn_modify(volume_name, {}, peer_slots=3) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not configure volume peer slots of {}: {}' + .format(volume_name, error_str) + ) + + @classmethod + def _activate_device_path(cls, lin, node_name, volume_name): + result = lin.resource_make_available(node_name, volume_name, diskful=True) + if linstor.Linstor.all_api_responses_no_error(result): + return + errors = linstor.Linstor.filter_api_call_response_errors(result) + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_EXISTS_RSC + ): + return + + raise LinstorVolumeManagerError( + 'Unable to activate device path of `{}` on node `{}`: {}' + .format(volume_name, node_name, ', '.join( + [str(x) for x in result])) + ) + + @classmethod + def _request_database_path(cls, lin, activate=False): + node_name = socket.gethostname() + + try: + resources = filter( + lambda resource: resource.node_name == node_name and + resource.name == DATABASE_VOLUME_NAME, + lin.resource_list_raise().resources + ) + except Exception as e: + raise LinstorVolumeManagerError( + 'Unable to get resources during database creation: {}' + .format(e) + ) + + if not resources: + if activate: + cls._activate_device_path( + lin, node_name, DATABASE_VOLUME_NAME + ) + return cls._request_database_path( + DATABASE_VOLUME_NAME, DATABASE_VOLUME_NAME + ) + raise LinstorVolumeManagerError( + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(DATABASE_PATH) + ) + # Contains a path of the /dev/drbd form. + return resources[0].volumes[0].device_path + + @classmethod + def _create_database_volume( + cls, lin, group_name, node_names, redundancy, auto_quorum + ): + try: + dfns = lin.resource_dfn_list_raise().resource_definitions + except Exception as e: + raise LinstorVolumeManagerError( + 'Unable to get definitions during database creation: {}' + .format(e) + ) + + if dfns: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, '.format( + DATABASE_VOLUME_NAME, group_name + ) + 'LINSTOR volume list must be empty.' + ) + + # Workaround to use thin lvm. Without this line an error is returned: + # "Not enough available nodes" + # I don't understand why but this command protect against this bug. + try: + pools = lin.storage_pool_list_raise( + filter_by_stor_pools=[group_name] + ) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to get storage pool list before database creation: {}' + .format(e) + ) + + # Ensure we have a correct list of storage pools. + nodes_with_pool = map(lambda pool: pool.node_name, pools.storage_pools) + assert nodes_with_pool # We must have at least one storage pool! + for node_name in nodes_with_pool: + assert node_name in node_names + util.SMlog('Nodes with storage pool: {}'.format(nodes_with_pool)) + + # Create the database definition. 
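_create_linstor_instance() above tries the given URI once, then keeps retrying, optionally re-resolving the controller URI on every attempt because the controller may move to another host. A sketch of that strategy with stand-ins for linstor.Linstor(...).connect() and get_controller_uri():

import time

def connect_with_retry(connect, resolve_uri, uri, keep_uri=False,
                       attempts=30, period=1):
    try:
        return connect(uri)
    except Exception:
        pass

    if not keep_uri:
        uri = None  # re-resolve on every retry, the controller may have moved

    for i in range(attempts):
        try:
            return connect(uri if uri else resolve_uri())
        except Exception:
            if i == attempts - 1:
                raise
            time.sleep(period)

failures = {'left': 2}

def fake_connect(uri):
    if failures['left']:
        failures['left'] -= 1
        raise IOError('controller not reachable yet')
    return 'connected to {}'.format(uri)

assert connect_with_retry(
    fake_connect, lambda: 'linstor://localhost', uri=None, period=0
) == 'connected to linstor://localhost'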
+ size = cls.round_up_volume_size(DATABASE_SIZE) + cls._check_volume_creation_errors(lin.resource_group_spawn( + rsc_grp_name=group_name, + rsc_dfn_name=DATABASE_VOLUME_NAME, + vlm_sizes=['{}B'.format(size)], + definitions_only=True + ), DATABASE_VOLUME_NAME, group_name) + cls._configure_volume_peer_slots(lin, DATABASE_VOLUME_NAME) + + # Create real resources on the first nodes. + resources = [] + + diskful_nodes = [] + diskless_nodes = [] + for node_name in node_names: + if node_name in nodes_with_pool: + diskful_nodes.append(node_name) + else: + diskless_nodes.append(node_name) + + assert diskful_nodes + for node_name in diskful_nodes[:redundancy]: + util.SMlog('Create database diskful on {}'.format(node_name)) + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=DATABASE_VOLUME_NAME, + storage_pool=group_name + )) + # Create diskless resources on the remaining set. + for node_name in diskful_nodes[redundancy:] + diskless_nodes: + util.SMlog('Create database diskless on {}'.format(node_name)) + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=DATABASE_VOLUME_NAME, + diskless=True + )) + + result = lin.resource_create(resources) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create database volume from SR `{}`: {}'.format( + group_name, error_str + ) + ) + + # We must modify the quorum. Otherwise we can't use correctly the + # drbd-reactor daemon. + if auto_quorum: + result = lin.resource_dfn_modify(DATABASE_VOLUME_NAME, { + 'DrbdOptions/auto-quorum': 'disabled', + 'DrbdOptions/Resource/quorum': 'majority' + }) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not activate quorum on database volume: {}' + .format(error_str) + ) + + # Create database and ensure path exists locally and + # on replicated devices. + current_device_path = cls._request_database_path(lin, activate=True) + + # Ensure diskless paths exist on other hosts. Otherwise PBDs can't be + # plugged. + for node_name in node_names: + cls._activate_device_path(lin, node_name, DATABASE_VOLUME_NAME) + + # We use realpath here to get the /dev/drbd path instead of + # /dev/drbd/by-res/. + expected_device_path = cls.build_device_path(DATABASE_VOLUME_NAME) + util.wait_for_path(expected_device_path, 5) + + device_realpath = os.path.realpath(expected_device_path) + if current_device_path != device_realpath: + raise LinstorVolumeManagerError( + 'Invalid path, current={}, expected={} (realpath={})' + .format( + current_device_path, + expected_device_path, + device_realpath + ) + ) + + try: + util.retry( + lambda: util.pread2([DATABASE_MKFS, expected_device_path]), + maxretry=5 + ) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to execute {} on database volume: {}' + .format(DATABASE_MKFS, e) + ) + + return expected_device_path + + @classmethod + def _destroy_database_volume(cls, lin, group_name): + error_str = cls._get_error_str( + lin.resource_dfn_delete(DATABASE_VOLUME_NAME) + ) + if error_str: + raise LinstorVolumeManagerError( + 'Could not destroy resource `{}` from SR `{}`: {}' + .format(DATABASE_VOLUME_NAME, group_name, error_str) + ) + + @classmethod + def _mount_database_volume(cls, volume_path, mount=True, force=False): + backup_path = DATABASE_PATH + '-' + str(uuid.uuid4()) + + try: + # 1. Create a backup config folder. 
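_create_database_volume() above places diskful replicas of the database on the first `redundancy` nodes that own a storage pool and gives every remaining node a diskless resource so its PBD can still be plugged. A small sketch of that placement split; host names are illustrative:

def plan_database_placement(node_names, nodes_with_pool, redundancy):
    diskful = [n for n in node_names if n in nodes_with_pool]
    diskless = [n for n in node_names if n not in nodes_with_pool]
    return {
        'diskful': diskful[:redundancy],
        'diskless': diskful[redundancy:] + diskless,
    }

plan = plan_database_placement(
    ['host-1', 'host-2', 'host-3'],
    nodes_with_pool={'host-1', 'host-2', 'host-3'},
    redundancy=2
)
assert plan == {'diskful': ['host-1', 'host-2'], 'diskless': ['host-3']}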
+ database_not_empty = bool(os.listdir(DATABASE_PATH)) + if database_not_empty: + try: + os.mkdir(backup_path) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to create backup path {} of LINSTOR config: {}' + .format(backup_path, e) + ) + + # 2. Move the config in the mounted volume. + if database_not_empty: + cls._move_files(DATABASE_PATH, backup_path) + + cls._mount_volume(volume_path, DATABASE_PATH, mount) + + if database_not_empty: + cls._move_files(backup_path, DATABASE_PATH, force) + + # 3. Remove useless backup directory. + try: + os.rmdir(backup_path) + except Exception: + raise LinstorVolumeManagerError( + 'Failed to remove backup path {} of LINSTOR config {}' + .format(backup_path, e) + ) + except Exception as e: + def force_exec(fn): + try: + fn() + except Exception: + pass + + if mount == cls._is_mounted(DATABASE_PATH): + force_exec(lambda: cls._move_files( + DATABASE_PATH, backup_path + )) + force_exec(lambda: cls._mount_volume( + volume_path, DATABASE_PATH, not mount + )) + + if mount != cls._is_mounted(DATABASE_PATH): + force_exec(lambda: cls._move_files( + backup_path, DATABASE_PATH + )) + + force_exec(lambda: os.rmdir(backup_path)) + raise e + + @classmethod + def _force_destroy_database_volume(cls, lin, group_name): + try: + cls._destroy_database_volume(lin, group_name) + except Exception: + pass + + @classmethod + def _destroy_storage_pool(cls, lin, group_name, node_name): + def destroy(): + result = lin.storage_pool_delete(node_name, group_name) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_NOT_FOUND_STOR_POOL, + linstor.consts.FAIL_NOT_FOUND_STOR_POOL_DFN + ]): + return + + if errors: + raise LinstorVolumeManagerError( + 'Failed to destroy SP `{}` on node `{}`: {}'.format( + group_name, + node_name, + cls._get_error_str(errors) + ) + ) + + # We must retry to avoid errors like: + # "can not be deleted as volumes / snapshot-volumes are still using it" + # after LINSTOR database volume destruction. + return util.retry(destroy, maxretry=10) + + @classmethod + def _destroy_resource_group(cls, lin, group_name): + def destroy(): + result = lin.resource_group_delete(group_name) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_NOT_FOUND_RSC_GRP + ]): + return + + if errors: + raise LinstorVolumeManagerError( + 'Failed to destroy RG `{}`: {}' + .format(group_name, cls._get_error_str(errors)) + ) + + return util.retry(destroy, maxretry=10) + + @classmethod + def _build_group_name(cls, base_name): + # If thin provisioning is used we have a path like this: + # `VG/LV`. "/" is not accepted by LINSTOR. 
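_build_group_name(), whose return statement follows just below, normalises the user-supplied group name: LINSTOR rejects '/' in names, so a thin-provisioning pool given as 'VG/LV' is flattened and prefixed. A sketch assuming an 'xcp-sr-' prefix; the real PREFIX_SR constant is defined on the class:

PREFIX_SR = 'xcp-sr-'  # assumed prefix for the example

def build_group_name(base_name):
    return '{}{}'.format(PREFIX_SR, base_name.replace('/', '_'))

assert build_group_name('linstor_group') == 'xcp-sr-linstor_group'
assert build_group_name('vg_linstor/thin_lv') == 'xcp-sr-vg_linstor_thin_lv'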
+ return '{}{}'.format(cls.PREFIX_SR, base_name.replace('/', '_')) + + @classmethod + def _check_volume_creation_errors(cls, result, volume_uuid, group_name): + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_EXISTS_RSC, linstor.consts.FAIL_EXISTS_RSC_DFN + ]): + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`, it already exists' + .format(volume_uuid, group_name), + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + if errors: + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`: {}'.format( + volume_uuid, + group_name, + cls._get_error_str(errors) + ) + ) + + @classmethod + def _move_files(cls, src_dir, dest_dir, force=False): + def listdir(dir): + ignored = ['lost+found'] + return filter(lambda file: file not in ignored, os.listdir(dir)) + + try: + if not force: + files = listdir(dest_dir) + if files: + raise LinstorVolumeManagerError( + 'Cannot move files from {} to {} because destination ' + 'contains: {}'.format(src_dir, dest_dir, files) + ) + except LinstorVolumeManagerError: + raise + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot list dir {}: {}'.format(dest_dir, e) + ) + + try: + for file in listdir(src_dir): + try: + dest_file = os.path.join(dest_dir, file) + if not force and os.path.exists(dest_file): + raise LinstorVolumeManagerError( + 'Cannot move {} because it already exists in the ' + 'destination'.format(file) + ) + shutil.move(os.path.join(src_dir, file), dest_file) + except LinstorVolumeManagerError: + raise + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot move {}: {}'.format(file, e) + ) + except Exception as e: + if not force: + try: + cls._move_files(dest_dir, src_dir, force=True) + except Exception: + pass + + raise LinstorVolumeManagerError( + 'Failed to move files from {} to {}: {}'.format( + src_dir, dest_dir, e + ) + ) + + @staticmethod + def _get_filtered_properties(properties): + return dict(properties.items()) + + @staticmethod + def _filter_errors(result): + return [ + err for err in result + if hasattr(err, 'is_error') and err.is_error() + ] + + @staticmethod + def _check_errors(result, codes): + for err in result: + for code in codes: + if err.is_error(code): + return True + return False + + @classmethod + def _controller_is_running(cls): + return cls._service_is_running('linstor-controller') + + @classmethod + def _start_controller(cls, start=True): + return cls._start_service('linstor-controller', start) + + @staticmethod + def _start_service(name, start=True): + action = 'start' if start else 'stop' + (ret, out, err) = util.doexec([ + 'systemctl', action, name + ]) + if ret != 0: + raise LinstorVolumeManagerError( + 'Failed to {} {}: {} {}' + .format(action, name, out, err) + ) + + @staticmethod + def _service_is_running(name): + (ret, out, err) = util.doexec([ + 'systemctl', 'is-active', '--quiet', name + ]) + return not ret + + @staticmethod + def _is_mounted(mountpoint): + (ret, out, err) = util.doexec(['mountpoint', '-q', mountpoint]) + return ret == 0 + + @classmethod + def _mount_volume(cls, volume_path, mountpoint, mount=True): + if mount: + try: + util.pread(['mount', volume_path, mountpoint]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to mount volume {} on {}: {}' + .format(volume_path, mountpoint, e) + ) + else: + try: + if cls._is_mounted(mountpoint): + util.pread(['umount', mountpoint]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to umount volume {} 
on {}: {}' + .format(volume_path, mountpoint, e) + ) + + +# ============================================================================== + +# Check if a path is a DRBD resource and log the process name/pid +# that opened it. +def log_drbd_openers(path): + # Ignore if it's not a symlink to DRBD resource. + if not path.startswith(DRBD_BY_RES_PATH): + return + + # Compute resource name. + res_name_end = path.find('/', len(DRBD_BY_RES_PATH)) + if res_name_end == -1: + return + res_name = path[len(DRBD_BY_RES_PATH):res_name_end] + + volume_end = path.rfind('/') + if volume_end == res_name_end: + return + volume = path[volume_end + 1:] + + try: + # Ensure path is a DRBD. + drbd_path = os.path.realpath(path) + stats = os.stat(drbd_path) + if not stat.S_ISBLK(stats.st_mode) or os.major(stats.st_rdev) != 147: + return + + # Find where the device is open. + (ret, stdout, stderr) = util.doexec(['drbdadm', 'status', res_name]) + if ret != 0: + util.SMlog('Failed to execute `drbdadm status` on `{}`: {}'.format( + res_name, stderr + )) + return + + # Is it a local device? + if stdout.startswith('{} role:Primary'.format(res_name)): + util.SMlog( + 'DRBD resource `{}` is open on local host: {}' + .format(path, get_local_volume_openers(res_name, volume)) + ) + return + + # Is it a remote device? + util.SMlog( + 'DRBD resource `{}` is open on hosts: {}' + .format(path, get_all_volume_openers(res_name, volume)) + ) + except Exception as e: + util.SMlog( + 'Got exception while trying to determine where DRBD resource ' + + '`{}` is open: {}'.format(path, e) + ) diff --git a/drivers/nfs.py b/drivers/nfs.py index a40b8eda..c06cfc4c 100644 --- a/drivers/nfs.py +++ b/drivers/nfs.py @@ -202,7 +202,11 @@ def scan_exports(target): textnode = dom.createTextNode(target) subentry.appendChild(textnode) - (path, access) = val.split() + # Access is not always provided by showmount return + # If none is provided we need to assume "*" + array = val.split() + path = array[0] + access = array[1] if len(array) >= 2 else "*" subentry = dom.createElement("Path") entry.appendChild(subentry) textnode = dom.createTextNode(path) diff --git a/drivers/on_slave.py b/drivers/on_slave.py index 0d60d969..7b8c55d9 100755 --- a/drivers/on_slave.py +++ b/drivers/on_slave.py @@ -72,7 +72,18 @@ def multi(session, args): def _is_open(session, args): """Check if VDI is open by a tapdisk on this host""" - import SRCommand, SR, NFSSR, EXTSR, LVHDSR, blktap2 + import SRCommand + import SR + import CephFSSR + import EXTSR + import GlusterFSSR + import LinstorSR + import LVHDSR + import MooseFSSR + import NFSSR + import XFSSR + import ZFSSR + import blktap2 util.SMlog("on-slave.is_open: %s" % args) vdiUuid = args["vdiUuid"] @@ -86,11 +97,39 @@ def _is_open(session, args): srType = "lvhd" cmd = SRCommand.SRCommand(None) cmd.driver_info = {"capabilities": None} - cmd.dconf = {"server": None, "device": "/HACK"} + cmd.dconf = { + "server": None, + "device": "/HACK", + # Hack for custom XCP-ng drivers. + "masterhost": None, # MooseFS + "rootpath": None, # MooseFS + "serverpath": None, # CephFS + "location": "/HACK" # ZFS + } cmd.params = {"command": None} + sr_uuid = srRec["uuid"] + + # Another ugly piece of code to load a real Linstor SR, otherwise + # we can't fetch the VDI path. 
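+ # (LinstorSR needs its real device-config from the PBD -- notably the
+ # group name -- to resolve volume paths, so the generic dconf built
+ # above is not sufficient for it.)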
+ if srType == 'linstor': + host_ref = util.get_this_host_ref(session) + sr_ref = session.xenapi.SR.get_by_uuid(sr_uuid) + + pbd = util.find_my_pbd(session, host_ref, sr_ref) + if pbd is None: + raise util.SMException('Failed to find Linstor PBD') + + cmd.dconf = session.xenapi.PBD.get_device_config(pbd) + driver = SR.driver(srType) - sr = driver(cmd, srRec["uuid"]) + sr = driver(cmd, sr_uuid) + + # session_ref param is required to have a valid session when SR object is created. + # It's not the case here, so attach the current session object to make LinstorSR happy. + if srType == 'linstor': + sr.session = session + vdi = sr.vdi(vdiUuid) tapdisk = blktap2.Tapdisk.find_by_path(vdi.path) util.SMlog("Tapdisk for %s: %s" % (vdi.path, tapdisk)) diff --git a/drivers/tapdisk-pause b/drivers/tapdisk-pause index 59368696..e0bca7be 100755 --- a/drivers/tapdisk-pause +++ b/drivers/tapdisk-pause @@ -29,6 +29,12 @@ import lvhdutil import vhdutil import lvmcache +try: + from linstorvolumemanager import get_controller_uri, LinstorVolumeManager + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + TAPDEV_BACKPATH_PFX = "/dev/sm/backend" TAPDEV_PHYPATH_PFX = "/dev/sm/phy" @@ -73,24 +79,6 @@ def _mkphylink(sr_uuid, vdi_uuid, path): util.pread2(cmd) return path -def _pathRefresh(): - # LVM rename check - realpath = os.path.realpath(self.phypath) - phypath = vdi_type = None - util.SMlog("Realpath: %s" % realpath) - if realpath.startswith("/dev/VG_XenStorage-") and \ - not os.path.exists(realpath): - util.SMlog("Path inconsistent") - pfx = "/dev/VG_XenStorage-%s/" % self.sr_uuid - for ty in ["LV","VHD"]: - p = pfx + ty + "-" + self.vdi_uuid - util.SMlog("Testing path: %s" % p) - if os.path.exists(p): - _mkphylink(self.sr_uuid, self.vdi_uuid, p) - phypath = p - if ty == "LV": vdi_type = "aio" - else: vdi_type = "vhd" - def tapPause(session, args): tap = Tapdisk(session, args) return tap.Pause() @@ -148,7 +136,47 @@ class Tapdisk: self.realpath = p if ty == "LV": self.vdi_type = "aio" else: self.vdi_type = "vhd" - + elif realpath.startswith('/dev/drbd/by-res/xcp-volume-'): + if not LINSTOR_AVAILABLE: + raise util.SMException( + 'Can\'t refresh tapdisk: LINSTOR libraries are missing' + ) + + # We must always recreate the symlink to ensure we have + # the right info. Why? Because if the volume UUID is changed in + # LINSTOR the symlink is not directly updated. When live leaf + # coalesce is executed we have these steps: + # "A" -> "OLD_A" + # "B" -> "A" + # Without symlink update the previous "A" path is reused instead of + # "B" path. Note: "A", "B" and "OLD_A" are UUIDs. 
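+ # Concretely: the VDI being paused may now be backed by a different
+ # DRBD resource than the one the stale phy-link points to, so the
+ # device path is re-resolved through LinstorVolumeManager below and
+ # the link is rebuilt if it differs.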
+ session = self.session + + host_ref = util.get_this_host_ref(session) + sr_ref = session.xenapi.SR.get_by_uuid(self.sr_uuid) + + pbd = util.find_my_pbd(session, host_ref, sr_ref) + if pbd is None: + raise util.SMException('Failed to find PBD') + + dconf = session.xenapi.PBD.get_device_config(pbd) + group_name = dconf['group-name'] + + device_path = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ).get_device_path(self.vdi_uuid) + + if realpath != device_path: + util.SMlog( + 'Update LINSTOR PhyLink (previous={}, current={})' + .format(realpath, device_path) + ) + os.unlink(self.phypath) + _mkphylink(self.sr_uuid, self.vdi_uuid, device_path) + self.realpath = device_path + @locking("VDIUnavailable") def Pause(self): util.SMlog("Pause for %s" % self.vdi_uuid) diff --git a/drivers/util.py b/drivers/util.py index 2353f4b2..2ccfe3d6 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -651,10 +651,15 @@ def get_this_host(): f.close() return uuid -def is_master(session): + +def get_master_ref(session): pools = session.xenapi.pool.get_all() - master = session.xenapi.pool.get_master(pools[0]) - return get_this_host_ref(session) == master + return session.xenapi.pool.get_master(pools[0]) + + +def is_master(session): + return get_this_host_ref(session) == get_master_ref(session) + # XXX: this function doesn't do what it claims to do def get_localhost_uuid(session): @@ -695,6 +700,18 @@ def get_hosts_attached_on(session, vdi_uuids): host_refs[key[len('host_'):]] = True return host_refs.keys() +def get_this_host_address(session): + host_uuid = get_this_host() + host_ref = session.xenapi.host.get_by_uuid(host_uuid) + return session.xenapi.host.get_record(host_ref)['address'] + +def get_host_addresses(session): + addresses = [] + hosts = session.xenapi.host.get_all_records() + for record in hosts.itervalues(): + addresses.append(record['address']) + return addresses + def get_this_host_ref(session): host_uuid = get_this_host() host_ref = session.xenapi.host.get_by_uuid(host_uuid) @@ -823,10 +840,9 @@ def handler(signum, frame): signal.signal(signal.SIGALRM, handler) signal.alarm(timeoutseconds) try: - function(*arguments) - except: + return function(*arguments) + finally: signal.alarm(0) - raise def _incr_iscsiSR_refcount(targetIQN, uuid): @@ -1384,13 +1400,21 @@ def findRunningProcessOrOpenFile(name, process = True): else: return (retVal, processandpids) -def retry(f, maxretry=20, period=3): +def retry(f, maxretry=20, period=3, exceptions=[Exception]): retries = 0 while True: try: return f() - except Exception, e: - SMlog("Got exception: %s. Retry number: %s" % (str(e),retries)) + except Exception as e: + for exception in exceptions: + if isinstance(e, exception): + SMlog('Got exception: {}. Retry number: {}'.format( + str(e), retries + )) + break + else: + SMlog('Got bad exception: {}. 
Raising...'.format(e)) + raise e retries += 1 if retries >= maxretry: @@ -1784,3 +1808,94 @@ def sessions_less_than_targets(other_config, device_config): else: return False + +def enable_and_start_service(name, start): + attempt = 0 + while True: + attempt += 1 + fn = 'enable' if start else 'disable' + args = ('systemctl', fn, '--now', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + elif attempt >= 3: + raise Exception( + 'Failed to {} {}: {} {}'.format(fn, name, out, err) + ) + time.sleep(1) + + +def stop_service(name): + args = ('systemctl', 'stop', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + raise Exception('Failed to stop {}: {} {}'.format(name, out, err)) + + +def restart_service(name): + attempt = 0 + while True: + attempt += 1 + SMlog('Restarting service {} {}...'.format(name, attempt)) + args = ('systemctl', 'restart', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + elif attempt >= 3: + SMlog('Restart service FAILED {} {}'.format(name, attempt)) + raise Exception( + 'Failed to restart {}: {} {}'.format(name, out, err) + ) + time.sleep(1) + + +def check_pid_exists(pid): + try: + os.kill(pid, 0) + except OSError: + return False + else: + return True + + +def make_profile(name, function): + """ + Helper to execute cProfile using unique log file. + """ + + import cProfile + import itertools + import os.path + import time + + assert name + assert function + + FOLDER = '/tmp/sm-perfs/' + makedirs(FOLDER) + + filename = time.strftime('{}_%Y%m%d_%H%M%S.prof'.format(name)) + + def gen_path(path): + yield path + root, ext = os.path.splitext(path) + for i in itertools.count(start=1, step=1): + yield root + '.{}.'.format(i) + ext + + for profile_path in gen_path(FOLDER + filename): + try: + file = open_atomic(profile_path, 'w') + file.close() + break + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + try: + SMlog('* Start profiling of {} ({}) *'.format(name, filename)) + cProfile.runctx('function()', None, locals(), profile_path) + finally: + SMlog('* End profiling of {} ({}) *'.format(name, filename)) diff --git a/drivers/vhdutil.py b/drivers/vhdutil.py index 422834eb..48337f87 100755 --- a/drivers/vhdutil.py +++ b/drivers/vhdutil.py @@ -97,17 +97,19 @@ def calcOverheadFull(virtual_size): def fullSizeVHD(virtual_size): return virtual_size + calcOverheadFull(virtual_size) -def ioretry(cmd): - return util.ioretry(lambda: util.pread2(cmd), - errlist = [errno.EIO, errno.EAGAIN]) +def ioretry(cmd, errlist=[errno.EIO, errno.EAGAIN]): + return util.ioretry(lambda: util.pread2(cmd), errlist) -def getVHDInfo(path, extractUuidFunction, includeParent = True): +def getVHDInfo(path, extractUuidFunction, includeParent=True, resolveParent=True): """Get the VHD info. 
The parent info may optionally be omitted: vhd-util tries to verify the parent by opening it, which results in error if the VHD resides on an inactive LV""" opts = "-vsf" if includeParent: opts += "p" + if not resolveParent: + opts += "u" + cmd = [VHD_UTIL, "query", OPT_LOG_ERR, opts, "-n", path] ret = ioretry(cmd) fields = ret.strip().split('\n') diff --git a/etc/systemd/system/drbd-reactor.service.d/override.conf b/etc/systemd/system/drbd-reactor.service.d/override.conf new file mode 100644 index 00000000..c079ab62 --- /dev/null +++ b/etc/systemd/system/drbd-reactor.service.d/override.conf @@ -0,0 +1,7 @@ +[Unit] +StartLimitInterval=60 +StartLimitBurst=10 + +[Service] +Restart=always +RestartSec=2 diff --git a/etc/systemd/system/linstor-satellite.service.d/override.conf b/etc/systemd/system/linstor-satellite.service.d/override.conf new file mode 100644 index 00000000..b1686b4f --- /dev/null +++ b/etc/systemd/system/linstor-satellite.service.d/override.conf @@ -0,0 +1,5 @@ +[Service] +Environment=LS_KEEP_RES=^xcp-persistent* + +[Unit] +After=drbd.service diff --git a/etc/systemd/system/var-lib-linstor.service b/etc/systemd/system/var-lib-linstor.service new file mode 100644 index 00000000..e9deb904 --- /dev/null +++ b/etc/systemd/system/var-lib-linstor.service @@ -0,0 +1,21 @@ +# Regarding the current version of systemd (v.219) used in XCP-ng, we can't use +# the ReadWriteOnly option (to apply the -w flag, it's not the same than -o rw). +# This file is a workaround to avoid RO. It must be replaced with the code below +# in a mount unit. Compatible with version >= 246. +# +# [Unit] +# Description=Filesystem for the LINSTOR controller +# +# [Mount] +# What=/dev/drbd/by-res/xcp-persistent-database/0 +# Where=/var/lib/linstor +# ReadWriteOnly=true + +[Unit] +Description=Mount filesystem for the LINSTOR controller + +[Service] +Type=oneshot +ExecStart=/bin/mount -w /dev/drbd/by-res/xcp-persistent-database/0 /var/lib/linstor +ExecStop=/opt/xensource/libexec/safe-umount /var/lib/linstor +RemainAfterExit=true diff --git a/linstor/Makefile b/linstor/Makefile new file mode 100644 index 00000000..c329ca30 --- /dev/null +++ b/linstor/Makefile @@ -0,0 +1,22 @@ +PREFIX ?= /opt/xensource/libexec +DESTDIR ?= +DEBUGDIR ?= /opt/xensource/debug + + +OPTS := -Wall -std=gnu99 + +SRC := linstor-monitord.c + +BIN := linstor-monitord + +all: daemon + +daemon: linstor-monitord.c + $(CC) $(OPTS) $(SRC) -o $(BIN) + +install: linstor-monitord + mkdir -p $(DESTDIR)$(PREFIX) + install -m 755 $^ $(DESTDIR)$(PREFIX) + +clean: + rm -f linstor-monitord diff --git a/linstor/linstor-monitord.c b/linstor/linstor-monitord.c new file mode 100644 index 00000000..47740598 --- /dev/null +++ b/linstor/linstor-monitord.c @@ -0,0 +1,535 @@ +/* + * Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// TODO: Handle new hosts. +// TODO: https://github.com/xcp-ng/xcp/issues/421 + +// ============================================================================= + +#define POOL_CONF_DIR "/etc/xensource" +#define POOL_CONF_FILE "pool.conf" +#define POOL_CONF_ABS_FILE POOL_CONF_DIR "/" POOL_CONF_FILE + +// In milliseconds. +#define UPDATE_LINSTOR_NODE_TIMEOUT 2000 +#define SR_SCAN_TIMEOUT 720000 + +// ----------------------------------------------------------------------------- + +static inline void normalizeTime (struct timespec *spec) { + while (spec->tv_nsec >= 1000000000) { + ++spec->tv_sec; + spec->tv_nsec -= 1000000000; + } + while (spec->tv_nsec < 0) { + --spec->tv_sec; + spec->tv_nsec += 1000000000; + } +} + +static inline struct timespec getCurrentTime () { + struct timespec spec; + clock_gettime(CLOCK_MONOTONIC, &spec); + return (struct timespec){ + .tv_sec = spec.tv_sec, + .tv_nsec = spec.tv_nsec + }; +} + +static inline struct timespec getTimeDiff (const struct timespec *a, const struct timespec *b) { + struct timespec result = *a; + result.tv_sec -= b->tv_sec - 1; + result.tv_nsec -= b->tv_nsec + 1000000000; + normalizeTime(&result); + return result; +} + +static inline int64_t convertToMilliseconds (struct timespec spec) { + spec.tv_nsec += 1000 - spec.tv_nsec % 1000; + normalizeTime(&spec); + return spec.tv_sec * 1000 + spec.tv_nsec / 1000000; +} + +// ----------------------------------------------------------------------------- + +static inline int readPoolConf (char *buffer, size_t bufferSize) { + FILE *f = fopen(POOL_CONF_ABS_FILE, "r"); + if (!f) { + syslog(LOG_ERR, "Failed to open `" POOL_CONF_ABS_FILE "`: `%s`.", strerror(errno)); + return -errno; + } + + int ret = 0; + if (!fgets(buffer, bufferSize, f)) { + syslog(LOG_ERR, "Cannot read `" POOL_CONF_ABS_FILE "`."); + ret = -EIO; + } + + fclose(f); + + return ret; +} + +static inline int isMasterHost (int *error) { + if (error) + *error = 0; + + char buffer[512]; + + int ret = readPoolConf(buffer, sizeof buffer); + if (ret < 0) { + if (error) + *error = ret; + return 0; + } + + static const char masterStr[] = "master"; + static const size_t masterLen = sizeof masterStr - 1; + if (!strncmp(buffer, masterStr, masterLen)) { + const char end = buffer[masterLen]; + ret = end == '\0' || isspace(end); + } + + if (ret < 0) { + if (error) + *error = ret; + return 0; + } + + return ret; +} + +// ----------------------------------------------------------------------------- + +typedef struct { + int inotifyFd; + struct timespec lastScanTime; + int isMaster; + // TODO: Should be completed with at least a hostname field. +} State; + +// ----------------------------------------------------------------------------- + +typedef struct { + char *data; + size_t size; + size_t capacity; +} Buffer; + +#define max(a, b) ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? 
_a : _b; \ +}) + +static inline ssize_t readAll (int fd, Buffer *buffer) { + assert(buffer->capacity >= buffer->size); + + ssize_t ret = 0; + do { + size_t byteCount = buffer->capacity - buffer->size; + if (byteCount < 16) { + const size_t newCapacity = max(buffer->capacity << 1, 64); + char *p = realloc(buffer->data, newCapacity); + if (!p) + return -errno; + + buffer->data = p; + buffer->capacity = newCapacity; + + byteCount = buffer->capacity - buffer->size; + } + + ret = read(fd, buffer->data + buffer->size, byteCount); + if (ret > 0) + buffer->size += ret; + else if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + ret = 0; + } while (ret > 0); + + return ret; +} + +// ----------------------------------------------------------------------------- + +static inline int execCommand (char *argv[], Buffer *buffer) { + int pipefd[2]; + if (buffer) { + if (pipe(pipefd) < 0) { + syslog(LOG_ERR, "Failed to exec pipe: `%s`.", strerror(errno)); + return -errno; + } + + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0) { + syslog(LOG_ERR, "Failed to exec fcntl on pipe in: `%s`.", strerror(errno)); + close(pipefd[0]); + close(pipefd[1]); + return -errno; + } + } + + const pid_t pid = fork(); + if (pid < 0) { + syslog(LOG_ERR, "Failed to fork: `%s`.", strerror(errno)); + if (buffer) { + close(pipefd[0]); + close(pipefd[1]); + } + return -errno; + } + + // Child process. + if (pid == 0) { + if (buffer) { + close(STDOUT_FILENO); + dup(pipefd[1]); + + close(pipefd[0]); + close(pipefd[1]); + } + + if (execvp(*argv, argv) < 0) + syslog(LOG_ERR, "Failed to exec `%s` command.", *argv); + exit(EXIT_FAILURE); + } + + // Main process. + int ret = 0; + if (buffer) { + close(pipefd[1]); + + do { + struct pollfd fds = { pipefd[0], POLLIN | POLLHUP, 0 }; + const int res = poll(&fds, 1, 0); + if (res < 0) { + if (errno == EAGAIN) + continue; + syslog(LOG_ERR, "Failed to poll from command: `%s`.", strerror(errno)); + ret = -errno; + } else if (res > 0) { + if (fds.revents & POLLIN) + ret = readAll(pipefd[0], buffer); + if (fds.revents & POLLHUP) + break; // Input has been closed. + } + } while (ret >= 0); + + close(pipefd[0]); + } + + int status; + if (waitpid(pid, &status, 0) < 0) { + syslog(LOG_ERR, "Failed to wait command: `%s`.", *argv); + return -errno; + } + + if (WIFEXITED(status)) { + const int code = WEXITSTATUS(status); + if (code == 0) + syslog(LOG_INFO, "`%s` completed normally.", *argv); + else + syslog(LOG_ERR, "`%s` exited with an error: %d.", *argv, code); + } else if (WIFSIGNALED(status)) + syslog(LOG_ERR, "`%s` terminated by signal %d.", *argv, WTERMSIG(status)); + + return ret; +} + +// ----------------------------------------------------------------------------- + +static inline int createInotifyInstance () { + const int fd = inotify_init1(IN_CLOEXEC); + if (fd < 0) { + syslog(LOG_ERR, "Unable to create inotify instance: `%s`.", strerror(errno)); + return -errno; + } + return fd; +} + +static inline int addInotifyWatch (int inotifyFd, const char *filepath, uint32_t mask) { + const int wd = inotify_add_watch(inotifyFd, filepath, mask); + if (wd < 0) { + syslog(LOG_ERR, "Unable to register `%s`: `%s`.", filepath, strerror(errno)); + return -errno; + } + return wd; +} + +// ----------------------------------------------------------------------------- + +static inline int updateLinstorNode (State *state) { + char buffer[256]; + if (gethostname(buffer, sizeof buffer) == -1) { + syslog(LOG_ERR, "Failed to get hostname: `%s`.", strerror(errno)); + return errno ? 
-errno : -EINVAL; + } + + // TODO: Finish me, see: https://github.com/xcp-ng/xcp/issues/421 + + return 0; +} + +// ----------------------------------------------------------------------------- + +#define UUID_PARAM "uuid=" +#define UUID_PARAM_LEN (sizeof(UUID_PARAM) - 1) +#define UUID_LENGTH 36 + +static inline void scanLinstorSr (const char *uuid) { + char uuidBuf[UUID_LENGTH + UUID_PARAM_LEN + 1] = UUID_PARAM; + strncpy(uuidBuf + UUID_PARAM_LEN, uuid, UUID_LENGTH); + uuidBuf[UUID_LENGTH + UUID_PARAM_LEN] = '\0'; + execCommand((char *[]){ "xe", "sr-scan", uuidBuf, NULL }, NULL); +} + +// Called to update the physical/virtual size used by LINSTOR SRs in XAPI DB. +static inline int scanLinstorSrs () { + Buffer srs = {}; + const int ret = execCommand((char *[]){ "xe", "sr-list", "type=linstor", "--minimal", NULL }, &srs); + if (ret) { + free(srs.data); + return ret; + } + + const char *end = srs.data + srs.size; + char *pos = srs.data; + for (char *off; (off = memchr(pos, ',', end - pos)); pos = off + 1) + if (off - pos == UUID_LENGTH) + scanLinstorSr(pos); + + if (end - pos >= UUID_LENGTH) { + for (--end; end - pos >= UUID_LENGTH && isspace(*end); --end) {} + if (isalnum(*end)) + scanLinstorSr(pos); + } + + free(srs.data); + + return 0; +} + +// ----------------------------------------------------------------------------- + +#define PROCESS_MODE_DEFAULT 0 +#define PROCESS_MODE_WAIT_FILE_CREATION 1 + +static inline int waitForPoolConfCreation (State *state, int *wdFile); + +static inline int processPoolConfEvents (State *state, int wd, char **buffer, size_t *bufferSize, int mode, int *process) { + size_t size = 0; + if (ioctl(state->inotifyFd, FIONREAD, (char *)&size) == -1) { + syslog(LOG_ERR, "Failed to get buffer size from inotify descriptor: `%s`.", strerror(errno)); + return -errno; + } + + if (*bufferSize < size) { + void *ptr = realloc(*buffer, size); + if (!ptr) { + syslog(LOG_ERR, "Failed to reallocate buffer with size %zu: `%s`.", size, strerror(errno)); + return -errno; + } + *buffer = ptr; + *bufferSize = size; + } + + if ((size = (size_t)read(state->inotifyFd, *buffer, size)) == (size_t)-1) { + syslog(LOG_ERR, "Failed to read buffer from inotify descriptor: `%s`.", strerror(errno)); + return -errno; + } + + uint32_t mask = 0; + for (char *p = *buffer, *end = p + size; p < end; ) { + const struct inotify_event *event = (struct inotify_event *)p; + + if (event->mask & IN_Q_OVERFLOW) + syslog(LOG_WARNING, "Event queue overflow."); + + if (event->wd == wd) { + if (event->len) { + // Event in the watched directory. + if (!strncmp(event->name, POOL_CONF_FILE, event->len)) + mask |= event->mask; + } else { + // Directory or watched file event. + if (mode == PROCESS_MODE_DEFAULT) + mask |= event->mask; + else if (event->mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT)) { + syslog(LOG_ERR, "Watched `" POOL_CONF_DIR "` dir has been removed!"); + return -EIO; // The process should be exited after that. + } + } + } + + p += sizeof(struct inotify_event) + event->len; + } + + int ret = 0; + if (mode == PROCESS_MODE_DEFAULT) { + if (!mask) + return 0; + + syslog(LOG_INFO, "Updating linstor services... (Inotify mask=%" PRIu32 ")", mask); + if (mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT)) { + syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been removed!"); + inotify_rm_watch(state->inotifyFd, wd); // Do not forget to remove watch to avoid leaks. 
+ return -EIO; + } + } else { + if (mask & (IN_CREATE | IN_MOVED_TO)) { + syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been recreated!"); + *process = 0; + } + } + + return ret; +} + +static inline int waitAndProcessEvents (State *state, int wd, int mode) { + char *buffer = NULL; + size_t bufferSize = 0; + + int ret = 0; + int process = 1; + + struct timespec previousTime = getCurrentTime(); + do { + const struct timespec currentTime = getCurrentTime(); + const int64_t elapsedTime = convertToMilliseconds(getTimeDiff(¤tTime, &previousTime)); + + int timeout; + if (elapsedTime >= UPDATE_LINSTOR_NODE_TIMEOUT) { + updateLinstorNode(state); + timeout = UPDATE_LINSTOR_NODE_TIMEOUT; + previousTime = getCurrentTime(); + } else { + timeout = UPDATE_LINSTOR_NODE_TIMEOUT - elapsedTime; + } + + const int64_t elapsedScanTime = convertToMilliseconds(getTimeDiff(¤tTime, &state->lastScanTime)); + if (elapsedScanTime >= SR_SCAN_TIMEOUT) { + state->isMaster = isMasterHost(&ret); + if (state->isMaster) + scanLinstorSrs(); + state->lastScanTime = getCurrentTime(); + } + + struct pollfd fds = { state->inotifyFd, POLLIN, 0 }; + const int res = poll(&fds, 1, timeout); + if (res < 0) { + if (errno == EAGAIN) + continue; + syslog(LOG_ERR, "Failed to poll from inotify descriptor: `%s`.", strerror(errno)); + ret = -errno; + } else if (res > 0) { + state->isMaster = isMasterHost(&ret); + if (!ret) + ret = processPoolConfEvents(state, wd, &buffer, &bufferSize, mode, &process); + } + } while (ret >= 0 && process); + + free(buffer); + return ret; +} + +static inline int waitAndProcessFileEvents (State *state, int wd) { + return waitAndProcessEvents(state, wd, PROCESS_MODE_DEFAULT); +} + +static inline int waitAndProcessDirEvents (State *state, int wd) { + return waitAndProcessEvents(state, wd, PROCESS_MODE_WAIT_FILE_CREATION); +} + +static inline int waitForPoolConfCreation (State *state, int *wdFile) { + const int wdDir = addInotifyWatch( + state->inotifyFd, POOL_CONF_DIR, IN_MOVED_TO | IN_CREATE | IN_MOVE_SELF | IN_DELETE_SELF + ); + if (wdDir < 0) + return wdDir; + + int ret = 0; + do { + do { + // Update LINSTOR services... + int ret; + state->isMaster = isMasterHost(&ret); + + // Ok we can't read the pool configuration file. + // Maybe the file doesn't exist. Waiting its creation... + } while ((ret == -ENOENT || ret == -EIO) && !(ret = waitAndProcessDirEvents(state, wdDir))); + + // The services have been updated, now we must add a new watch on the pool config file directly. + if (!ret) { + *wdFile = addInotifyWatch(state->inotifyFd, POOL_CONF_ABS_FILE, IN_MODIFY | IN_MOVE_SELF | IN_DELETE_SELF); + if (*wdFile < 0) + ret = *wdFile; + } + } while (ret == -ENOENT); + + inotify_rm_watch(state->inotifyFd, wdDir); + return ret; +} + +// ----------------------------------------------------------------------------- + +int main (int argc, char *argv[]) { + (void)argc; + (void)argv; + + openlog(argv[0], LOG_PID, LOG_USER | LOG_MAIL); + setlogmask(LOG_UPTO(LOG_INFO)); + + State state = { + .inotifyFd = -1, + .lastScanTime = getCurrentTime(), + .isMaster = 0 + }; + + const int inotifyFd = createInotifyInstance(); + if (inotifyFd < 0) + return -inotifyFd; + state.inotifyFd = inotifyFd; + + updateLinstorNode(&state); + + int ret = 0; + while (!ret || ret == -ENOENT || ret == -EIO) { + int wdFile; + if ((ret = waitForPoolConfCreation(&state, &wdFile)) < 0) + break; // If the pool config dir cannot be watched or accessed, we consider it is a fatal error. 
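+ // Otherwise watch pool.conf itself; -ENOENT/-EIO from the file event
+ // handler means the file went away again, so loop back and wait for
+ // it to be recreated.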
+ + ret = waitAndProcessFileEvents(&state, wdFile); + } + + close(inotifyFd); + return -ret; +} diff --git a/multipath/custom.conf b/multipath/custom.conf new file mode 100644 index 00000000..3c8583f1 --- /dev/null +++ b/multipath/custom.conf @@ -0,0 +1,6 @@ +# Custom configuration for multipathd + +# Changes made to this file will not be overwritten by future system updates. +# They will also be retained through system upgrades to newer releases. + +# Refer to "man multipath.conf" diff --git a/multipath/multipath.conf b/multipath/multipath.conf index aaf45e58..3de11441 100644 --- a/multipath/multipath.conf +++ b/multipath/multipath.conf @@ -1,3 +1,11 @@ +# --- WARNING: DO NOT EDIT THIS FILE --- +# The contents of this file may be overwritten at any future time through a +# system update, causing any custom configuration to be lost. +# +# For custom multipath configuration, create a separate .conf file in the +# /etc/multipath/conf.d/ directory. +# --- END OF WARNING --- + # This configuration file is used to overwrite the built-in configuration of # multipathd. # For information on the syntax refer to `man multipath.conf` and the examples @@ -103,4 +111,11 @@ devices { prio alua uid_attribute ID_SERIAL } + device { + vendor "TrueNAS" + product "iSCSI Disk" + hardware_handler "1 alua" + path_grouping_policy group_by_prio + prio alua + } } diff --git a/scripts/fork-log-daemon b/scripts/fork-log-daemon new file mode 100755 index 00000000..665a60ba --- /dev/null +++ b/scripts/fork-log-daemon @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import select +import signal +import subprocess +import sys +import syslog + +def main(): + process = subprocess.Popen(sys.argv[1:], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + signal.signal(signal.SIGTERM, signal.SIG_IGN) + write_to_stdout = True + + while process.poll() is None: + while True: + output = process.stdout.readline() + if not output: + break + + if write_to_stdout: + try: + print(output) + sys.stdout.flush() + except Exception: + # Probably a broken pipe. So the process reading stdout is dead. + write_to_stdout = False + syslog.syslog(output) + +if __name__ == "__main__": + syslog.openlog(ident=sys.argv[1], facility=syslog.LOG_DAEMON) + try: + main() + except Exception as e: + syslog.syslog(sys.argv[1] + ' terminated with exception: {}'.format(e)) + finally: + syslog.syslog(sys.argv[1] + ' is now terminated!') diff --git a/scripts/linstor-kv-tool b/scripts/linstor-kv-tool new file mode 100755 index 00000000..c9070270 --- /dev/null +++ b/scripts/linstor-kv-tool @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# +# Copyright (C) 2022 Vates SAS +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
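+ # Typical invocations (illustrative values; the controller URI and group
+ # name must match the pool's actual LINSTOR configuration -- the group
+ # name is the KV-store name, i.e. the storage group name with '/' replaced
+ # by '_' and the SR prefix prepended):
+ #
+ #   linstor-kv-tool --dump-volumes -u linstor://localhost -g xcp-sr-linstor_group_thin_device
+ #   linstor-kv-tool --remove-volume <vdi-uuid> -u linstor://localhost -g xcp-sr-linstor_group_thin_device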
+ +import argparse +import json +import linstor + + +def dump_kv(controller_uri, group_name, namespace): + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace=namespace + ) + print(json.dumps(kv, sort_keys=True, indent=2)) + + +def remove_volume(controller_uri, group_name, vdi_name): + assert vdi_name + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace='/xcp/volume/{}'.format(vdi_name) + ) + + for key, value in list(kv.items()): + del kv[key] + + +def remove_all_volumes(controller_uri, group_name): + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace='/' + ) + + for key, value in list(kv.items()): + if key.startswith('xcp/volume/') or key.startswith('xcp/sr/journal/'): + size = key.rindex('/') + kv.namespace = key[:size] + del kv[key[size + 1:]] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-u', '--uri', required=True) + parser.add_argument('-g', '--group-name', required=True) + parser.add_argument('-n', '--namespace', default='/') + + action = parser.add_mutually_exclusive_group(required=True) + action.add_argument('--dump-volumes', action='store_true') + action.add_argument('--remove-volume', metavar='VDI_UUID') + action.add_argument('--remove-all-volumes', action='store_true') + + args = parser.parse_args() + if args.dump_volumes: + dump_kv(args.uri, args.group_name, args.namespace) + elif args.remove_volume: + remove_volume(args.uri, args.group_name, args.remove_volume) + elif args.remove_all_volumes: + remove_all_volumes(args.uri, args.group_name) + + +if __name__ == '__main__': + main() diff --git a/scripts/safe-umount b/scripts/safe-umount new file mode 100755 index 00000000..9c1dcc40 --- /dev/null +++ b/scripts/safe-umount @@ -0,0 +1,39 @@ +#!/usr/bin/env python2 + +import argparse +import subprocess +import sys +import time + + +def safe_umount(path): + retry_count = 10 + not_mounted_str = 'umount: {}: not mounted'.format(path) + + last_code = 0 + while retry_count: + proc = subprocess.Popen(['mountpoint', '-q', path]) + proc.wait() + if proc.returncode: + return 0 + + proc = subprocess.Popen(['umount', path], stderr=subprocess.PIPE) + (stdout, stderr) = proc.communicate() + if not proc.returncode: + return 0 + + error = stderr.strip() + if error == not_mounted_str: + return 0 + + retry_count -= 1 + last_code = proc.returncode + time.sleep(0.500) + return last_code + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('path') + args = parser.parse_args() + sys.exit(safe_umount(args.path)) diff --git a/systemd/linstor-monitor.service b/systemd/linstor-monitor.service new file mode 100644 index 00000000..5f8f0a76 --- /dev/null +++ b/systemd/linstor-monitor.service @@ -0,0 +1,13 @@ +[Unit] +Description=LINSTOR Monitor +Before=xs-sm.service +ConditionPathExists=/usr/share/linstor-server/bin/Controller + +[Service] +StandardOutput=null +StandardError=journal +ExecStart=/opt/xensource/libexec/linstor-monitord +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/systemd/xs-sm.service b/systemd/xs-sm.service index 99cb313f..609c6ef5 100644 --- a/systemd/xs-sm.service +++ b/systemd/xs-sm.service @@ -1,5 +1,5 @@ [Unit] -Description=XenServer Storage Manager (SM) +Description=XCP-ng Storage Manager (SM) Before=xapi.service Conflicts=shutdown.target RefuseManualStop=yes diff --git a/tests/mocks/linstor/__init__.py b/tests/mocks/linstor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_ISOSR.py b/tests/test_ISOSR.py index 
914b9616..a5b32903 100644 --- a/tests/test_ISOSR.py +++ b/tests/test_ISOSR.py @@ -19,6 +19,65 @@ def __init__(self, srcmd, none): self.dconf = srcmd.dconf self.srcmd = srcmd +class TestISOSR_overLocal(unittest.TestCase): + def create_isosr(self, location='/local_sr', sr_uuid='asr_uuid'): + srcmd = mock.Mock() + srcmd.dconf = { + 'location': location, + 'type': 'iso', + 'legacy_mode': True + } + srcmd.params = { + 'command': 'some_command' + } + isosr = FakeISOSR(srcmd, None) + isosr.load(sr_uuid) + return isosr + + @mock.patch('util.pread') + def test_load(self, pread): + self.create_isosr() + # Check `mount/umount` is never called. + self.assertFalse(pread.called) + + @mock.patch('os.path.exists', autospec=True) + @mock.patch('util.pread') + def test_attach_and_detach_local(self, pread, exists): + isosr = self.create_isosr() + isosr.attach(None) + self.assertFalse(pread.called) + isosr.detach(None) + self.assertFalse(pread.called) + + @mock.patch('os.path.exists', autospec=True) + @mock.patch('util.pread') + @mock.patch('ISOSR.ISOSR._checkmount') + def test_attach_and_detach_local_with_mounted_path( + self, _checkmount, pread, exists + ): + _checkmount.return_value = True + + isosr = self.create_isosr() + isosr.attach(None) + self.assertFalse(pread.called) + isosr.detach(None) + self.assertFalse(pread.called) + + @testlib.with_context + @mock.patch('os.path.exists') + @mock.patch('util.pread') + def test_attach_local_with_bad_path(self, context, pread, exists): + context.setup_error_codes() + + # Local path doesn't exist, but error list yes. + exists.side_effect = [False, True] + + isosr = self.create_isosr() + with self.assertRaises(SR.SROSError) as ose: + isosr.attach(None) + self.assertEquals(ose.exception.errno, 226) + self.assertFalse(pread.called) + class TestISOSR_overNFS(unittest.TestCase): diff --git a/tests/test_MooseFSSR.py b/tests/test_MooseFSSR.py new file mode 100644 index 00000000..5a61cf5e --- /dev/null +++ b/tests/test_MooseFSSR.py @@ -0,0 +1,62 @@ +import mock +import MooseFSSR +import unittest + + +class FakeMooseFSSR(MooseFSSR.MooseFSSR): + uuid = None + sr_ref = None + srcmd = None + other_config = {} + + def __init__(self, srcmd, none): + self.dconf = srcmd.dconf + self.srcmd = srcmd + + +class TestMooseFSSR(unittest.TestCase): + + def create_moosefssr(self, masterhost='aServer', rootpath='/aServerpath', + sr_uuid='asr_uuid'): + srcmd = mock.Mock() + srcmd.dconf = { + 'masterhost': masterhost, + 'rootpath': rootpath + } + srcmd.params = { + 'command': 'some_command', + 'device_config': {} + } + moosefssr = FakeMooseFSSR(srcmd, None) + moosefssr.load(sr_uuid) + return moosefssr + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_load(self, Lock): + self.create_moosefssr() + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.MooseFSSR.checkmount', autospec=True) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_attach_if_mounted_then_attached(self, mock_lock, mock_checkmount): + mfssr = self.create_moosefssr() + mock_checkmount.return_value=True + mfssr.attach('asr_uuid') + self.assertTrue(mfssr.attached) + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_mount_mountpoint_empty_string(self, mock_lock): + mfssr = self.create_moosefssr() + 
self.assertRaises(MooseFSSR.MooseFSException, mfssr.mount) + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.MooseFSSR.checkmount',return_value=False, autospec=True) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_detach_not_detached_if_not_mounted(self, mock_lock, mock_checkmount): + mfssr = self.create_moosefssr() + mfssr.attached = True + mock_checkmount.return_value=False + mfssr.detach('asr_uuid') + self.assertTrue(mfssr.attached) diff --git a/tests/test_ZFSSR.py b/tests/test_ZFSSR.py new file mode 100644 index 00000000..6f8040dc --- /dev/null +++ b/tests/test_ZFSSR.py @@ -0,0 +1,115 @@ +import FileSR +import mock +import os +import SR +import unittest +import ZFSSR + + +XML_DEFS = os.path.dirname(os.path.abspath(__file__)) + \ + '/../drivers/XE_SR_ERRORCODES.xml' + + +class FakeZFSSR(ZFSSR.ZFSSR): + uuid = None + sr_ref = None + session = None + srcmd = None + other_config = {} + vdis = {} + passthrough = True + + def __init__(self, srcmd, none): + self.dconf = srcmd.dconf + self.srcmd = srcmd + + +class TestZFSSR(unittest.TestCase): + def create_zfs_sr(self, sr_uuid='asr_uuid', location='fake_path'): + srcmd = mock.Mock() + srcmd.dconf = { + 'location': location + } + srcmd.params = { + 'command': 'some_command', + 'device_config': {} + } + sr = FakeZFSSR(srcmd, None) + sr.load(sr_uuid) + return sr + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + def test_load(self, lock, is_zfs_available): + self.create_zfs_sr() + + @mock.patch('xs_errors.XML_DEFS', new=XML_DEFS) + def test_load_with_zfs_unavailable(self): + failed = False + try: + self.create_zfs_sr() + except SR.SROSError as e: + # Check SRUnavailable error. + self.assertTrue(e.errno == 47) + failed = True + self.assertTrue(failed) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + def test_create(self, lock, is_zfs_path, is_zfs_available): + sr = self.create_zfs_sr() + sr.create(sr.uuid, 42) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + @mock.patch('xs_errors.XML_DEFS', new=XML_DEFS) + def test_create_with_invalid_zfs_path( + self, lock, is_zfs_path, is_zfs_available + ): + failed = False + + is_zfs_path.return_value = False + sr = self.create_zfs_sr() + try: + sr.create(sr.uuid, 42) + except SR.SROSError as e: + # Check ZFSSRCreate error. 
+ self.assertTrue(e.errno == 5000) + failed = True + self.assertTrue(failed) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + @mock.patch('FileSR.FileSR._checkmount', autospec=True) + @mock.patch('FileSR.FileSR._loadvdis', autospec=True) + @mock.patch('SR.SR.scan', autospec=True) + @mock.patch('os.path.ismount', autospec=True) + def test_scan( + self, ismount, scan, _loadvdis, _checkmount, lock, + is_zfs_path, is_zfs_available + ): + sr = self.create_zfs_sr() + sr.scan(sr.uuid) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + @mock.patch('FileSR.FileSR._checkmount', autospec=True) + @mock.patch('xs_errors.XML_DEFS', new=XML_DEFS) + def test_scan_with_invalid_zfs_path( + self, _checkmount, lock, is_zfs_path, is_zfs_available + ): + failed = False + + is_zfs_path.return_value = False + sr = self.create_zfs_sr() + try: + sr.scan(sr.uuid) + except SR.SROSError as e: + # Check SRUnavailable error. + self.assertTrue(e.errno == 47) + failed = True + self.assertTrue(failed) diff --git a/tests/test_nfs.py b/tests/test_nfs.py index 71800ab0..cef414fe 100644 --- a/tests/test_nfs.py +++ b/tests/test_nfs.py @@ -140,3 +140,33 @@ def test_validate_nfsversion_valid(self): for thenfsversion in ['3', '4', '4.1']: self.assertEquals(nfs.validate_nfsversion(thenfsversion), thenfsversion) + + # Can't use autospec due to http://bugs.python.org/issue17826 + @mock.patch('util.pread2') + def test_scan_exports(self, pread2): + pread2.side_effect = ["/srv/nfs\n/srv/nfs2 *\n/srv/nfs3 127.0.0.1/24"] + res = nfs.scan_exports('aServer') + + expected = """ + +\t +\t\taServer +\t\t/srv/nfs +\t\t* +\t +\t +\t\taServer +\t\t/srv/nfs2 +\t\t* +\t +\t +\t\taServer +\t\t/srv/nfs3 +\t\t127.0.0.1/24 +\t + +""" + + self.assertEqual(res.toprettyxml(), expected) + self.assertEqual(len(pread2.mock_calls), 1) + pread2.assert_called_with(['/usr/sbin/showmount', '--no-headers', '-e', 'aServer']) diff --git a/tests/test_on_slave.py b/tests/test_on_slave.py index 54ebcd38..4c12d903 100644 --- a/tests/test_on_slave.py +++ b/tests/test_on_slave.py @@ -13,7 +13,15 @@ class Test_on_slave_is_open(unittest.TestCase): - MOCK_IMPORTS = ['SRCommand', 'SR', 'NFSSR', 'EXTSR', 'LVHDSR', 'blktap2'] + MOCK_IMPORTS = [ + 'SRCommand', + 'SR', + 'NFSSR', + 'EXTSR', + 'LVHDSR', + 'LinstorSR', + 'blktap2' + ] def fake_import(self, name, *args): print 'Asked to import {}'.format(name)
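# Modules listed in MOCK_IMPORTS ('LinstorSR' is added because
# on_slave._is_open now imports it) are expected to come back as mocks
# instead of being really imported; anything not listed falls through to
# the real import.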