From b7d3ea782aafcc6a5a7dcab7242e7da2895c84b2 Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Thu, 20 May 2021 17:40:06 +0100 Subject: [PATCH 001/133] backport of ccd121cc248d79b749a63d4ad099e6d5f4b8b588: CA-354692: check for device parameter in create/probe calls Signed-off-by: Mark Syms Signed-off-by: Ronan Abhamon --- drivers/EXTSR.py | 3 +++ drivers/LVHDSR.py | 4 +++- drivers/SR.py | 9 +++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/EXTSR.py b/drivers/EXTSR.py index 4caa09df..da43082e 100755 --- a/drivers/EXTSR.py +++ b/drivers/EXTSR.py @@ -18,6 +18,7 @@ # EXTSR: Based on local-file storage repository, mounts ext3 partition import SR, SRCommand, FileSR, util, lvutil, scsiutil +from SR import deviceCheck import os import xs_errors @@ -141,10 +142,12 @@ def detach(self, sr_uuid): raise xs_errors.XenError('LVMUnMount', \ opterr='lvm -an failed errno is %d' % inst.code) + @deviceCheck def probe(self): return lvutil.srlist_toxml(lvutil.scan_srlist(EXT_PREFIX, self.dconf['device']), EXT_PREFIX) + @deviceCheck def create(self, sr_uuid, size): if self._checkmount(): raise xs_errors.XenError('SRExists') diff --git a/drivers/LVHDSR.py b/drivers/LVHDSR.py index cfee0717..dd8e20b9 100755 --- a/drivers/LVHDSR.py +++ b/drivers/LVHDSR.py @@ -19,6 +19,7 @@ # import SR +from SR import deviceCheck import VDI import SRCommand import util @@ -494,7 +495,7 @@ def _expand_size(self): util.logException("LVHDSR._expand_size for %s failed to resize" " the PV" % self.uuid) - + @deviceCheck def create(self, uuid, size): util.SMlog("LVHDSR.create for %s" % self.uuid) if not self.isMaster: @@ -849,6 +850,7 @@ def _updateStats(self, uuid, virtAllocDelta): self.physical_utilisation = stats['physical_utilisation'] self._db_update() + @deviceCheck def probe(self): return lvutil.srlist_toxml( lvutil.scan_srlist(lvhdutil.VG_PREFIX, self.dconf['device']), diff --git a/drivers/SR.py b/drivers/SR.py index 8cb516d3..be693d6b 100755 --- a/drivers/SR.py +++ 
b/drivers/SR.py @@ -49,6 +49,15 @@ def __init__(self, errno, reason): self.errno = errno Exception.__init__(self, reason) + +def deviceCheck(op): + def wrapper(self, *args): + if 'device' not in self.dconf: + raise xs_errors.XenError('ConfigDeviceMissing') + return op(self, *args) + return wrapper + + backends = [] def registerSR(SRClass): """Register SR with handler. All SR subclasses should call this in From 3a0c67d5fe6ac43f07f8633c79180d475bbb0cea Mon Sep 17 00:00:00 2001 From: Samuel Verschelde Date: Thu, 13 Aug 2020 15:22:17 +0200 Subject: [PATCH 002/133] Update xs-sm.service's description for XCP-ng This was a patch added to the sm RPM git repo before we had this forked git repo for sm in the xcp-ng github organisation. --- systemd/xs-sm.service | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/xs-sm.service b/systemd/xs-sm.service index 99cb313f..609c6ef5 100644 --- a/systemd/xs-sm.service +++ b/systemd/xs-sm.service @@ -1,5 +1,5 @@ [Unit] -Description=XenServer Storage Manager (SM) +Description=XCP-ng Storage Manager (SM) Before=xapi.service Conflicts=shutdown.target RefuseManualStop=yes From a8168e152d830f4d0f7fd085d52eab2eceab6f0a Mon Sep 17 00:00:00 2001 From: Samuel Verschelde Date: Thu, 13 Aug 2020 15:26:43 +0200 Subject: [PATCH 003/133] Add TrueNAS multipath config This was a patch added to the sm RPM git repo before we had this forked git repo for sm in the xcp-ng github organisation. 
--- multipath/multipath.conf | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/multipath/multipath.conf b/multipath/multipath.conf index aaf45e58..1073faa0 100644 --- a/multipath/multipath.conf +++ b/multipath/multipath.conf @@ -103,4 +103,11 @@ devices { prio alua uid_attribute ID_SERIAL } + device { + vendor "TrueNAS" + product "iSCSI Disk" + hardware_handler "1 alua" + path_grouping_policy group_by_prio + prio alua + } } From 6120e7f0373d206ee6195404852c3d5aed0bebdd Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 20 Jul 2020 16:26:42 +0200 Subject: [PATCH 004/133] feat(drivers): add CephFS, GlusterFS and XFS drivers --- Makefile | 3 + drivers/CephFSSR.py | 296 +++++++++++++++++++++++++++++++++++++++++ drivers/GlusterFSSR.py | 287 +++++++++++++++++++++++++++++++++++++++ drivers/XFSSR.py | 242 +++++++++++++++++++++++++++++++++ drivers/cleanup.py | 5 +- 5 files changed, 832 insertions(+), 1 deletion(-) create mode 100644 drivers/CephFSSR.py create mode 100644 drivers/GlusterFSSR.py create mode 100644 drivers/XFSSR.py diff --git a/Makefile b/Makefile index d07a8ca3..0a10470a 100755 --- a/Makefile +++ b/Makefile @@ -17,6 +17,9 @@ SM_DRIVERS += OCFSoHBA SM_DRIVERS += SHM SM_DRIVERS += SMB SM_DRIVERS += LVHDoFCoE +SM_DRIVERS += CephFS +SM_DRIVERS += GlusterFS +SM_DRIVERS += XFS SM_LIBS := SR SM_LIBS += SRCommand diff --git a/drivers/CephFSSR.py b/drivers/CephFSSR.py new file mode 100644 index 00000000..d7974907 --- /dev/null +++ b/drivers/CephFSSR.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# CEPHFSSR: Based on FileSR, mounts ceph fs share + +import errno +import os +import syslog as _syslog +import xmlrpclib +from syslog import syslog + +# careful with the import order here +# FileSR has a circular dependency: +# FileSR -> blktap2 -> lvutil -> EXTSR -> FileSR +# importing in this order seems to avoid triggering the issue. +import SR +import SRCommand +import FileSR +# end of careful +import cleanup +import util +import vhdutil +import xs_errors +from lock import Lock + +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "SR_CACHING", + "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", + "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", + "VDI_GENERATE_CONFIG", + "VDI_RESET_ON_BOOT/2", "ATOMIC_PAUSE"] + +CONFIGURATION = [ + ['server', 'Ceph server(s) (required, ex: "192.168.0.12" or "10.10.10.10,10.10.10.26")'], + ['serverpath', 'Ceph FS path (required, ex: "/")'], + ['serverport', 'ex: 6789'], + ['options', 'Ceph FS client name, and secretfile (required, ex: "name=admin,secretfile=/etc/ceph/admin.secret")'] +] + +DRIVER_INFO = { + 'name': 'CephFS VHD', + 'description': 'SR plugin which stores disks as VHD files on a CephFS storage', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2020 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +# The mountpoint for the directory when performing an sr_probe. 
All probes +# are guaranteed to be serialised by xapi, so this single mountpoint is fine. +PROBE_MOUNTPOINT = os.path.join(SR.MOUNT_BASE, "probe") + + +class CephFSException(Exception): + def __init__(self, errstr): + self.errstr = errstr + + +# mountpoint = /var/run/sr-mount/CephFS/uuid +# linkpath = mountpoint/uuid - path to SR directory on share +# path = /var/run/sr-mount/uuid - symlink to SR directory on share +class CephFSSR(FileSR.FileSR): + """Ceph file-based storage repository""" + + DRIVER_TYPE = 'cephfs' + + def handles(sr_type): + # fudge, because the parent class (FileSR) checks for smb to alter its behavior + return sr_type == CephFSSR.DRIVER_TYPE or sr_type == 'smb' + + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_ceph_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='ceph is not installed' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + self.driver_config = DRIVER_CONFIG + if 'server' not in self.dconf: + raise xs_errors.XenError('ConfigServerMissing') + self.remoteserver = self.dconf['server'] + self.remotepath = self.dconf['serverpath'] + # if serverport is not specified, use default 6789 + if 'serverport' not in self.dconf: + self.remoteport = "6789" + else: + self.remoteport = self.dconf['serverport'] + if self.sr_ref and self.session is not None: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + self.mountpoint = os.path.join(SR.MOUNT_BASE, 'CephFS', sr_uuid) + self.linkpath = os.path.join(self.mountpoint, sr_uuid or "") + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self._check_o_direct() + + def checkmount(self): + return util.ioretry(lambda: ((util.pathexists(self.mountpoint) and + util.ismount(self.mountpoint)) and + util.pathexists(self.path))) + + def mount(self, mountpoint=None): + """Mount the 
remote ceph export at 'mountpoint'""" + if mountpoint is None: + mountpoint = self.mountpoint + elif not util.is_string(mountpoint) or mountpoint == "": + raise CephFSException("mountpoint not a string object") + + try: + if not util.ioretry(lambda: util.isdir(mountpoint)): + util.ioretry(lambda: util.makedirs(mountpoint)) + except util.CommandException, inst: + raise CephFSException("Failed to make directory: code is %d" % inst.code) + + try: + options = [] + if self.dconf.has_key('options'): + options.append(self.dconf['options']) + if options: + options = ['-o', ','.join(options)] + command = ["mount", '-t', 'ceph', self.remoteserver+":"+self.remoteport+":"+self.remotepath, mountpoint] + options + util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) + except util.CommandException, inst: + syslog(_syslog.LOG_ERR, 'CephFS mount failed ' + inst.__str__()) + raise CephFSException("mount failed with return code %d" % inst.code) + + # Sanity check to ensure that the user has at least RO access to the + # mounted share; if listing it fails, unmount again and report the error. + try: + util.listdir(mountpoint) + except util.CommandException: + try: + self.unmount(mountpoint, True) + except CephFSException: + util.logException('CephFSSR.unmount()') + raise CephFSException("Permission denied. 
Please check user privileges.") + + def unmount(self, mountpoint, rmmountpoint): + try: + util.pread(["umount", mountpoint]) + except util.CommandException, inst: + raise CephFSException("umount failed with return code %d" % inst.code) + if rmmountpoint: + try: + os.rmdir(mountpoint) + except OSError, inst: + raise CephFSException("rmdir failed with error '%s'" % inst.strerror) + + def attach(self, sr_uuid): + if not self.checkmount(): + try: + self.mount() + os.symlink(self.linkpath, self.path) + except CephFSException, exc: + raise SR.SROSError(12, exc.errstr) + self.attached = True + + def probe(self): + try: + self.mount(PROBE_MOUNTPOINT) + sr_list = filter(util.match_uuid, util.listdir(PROBE_MOUNTPOINT)) + self.unmount(PROBE_MOUNTPOINT, True) + except (util.CommandException, xs_errors.XenError): + raise + # Create a dictionary from the SR uuids to feed SRtoXML() + sr_dict = {sr_uuid: {} for sr_uuid in sr_list} + return util.SRtoXML(sr_dict) + + def detach(self, sr_uuid): + if not self.checkmount(): + return + util.SMlog("Aborting GC/coalesce") + cleanup.abort(self.uuid) + # Change directory to avoid unmount conflicts + os.chdir(SR.MOUNT_BASE) + self.unmount(self.mountpoint, True) + os.unlink(self.path) + self.attached = False + + def create(self, sr_uuid, size): + if self.checkmount(): + raise SR.SROSError(113, 'CephFS mount point already attached') + + try: + self.mount() + except CephFSException, exc: + # noinspection PyBroadException + try: + os.rmdir(self.mountpoint) + except: + # we have no recovery strategy + pass + raise SR.SROSError(111, "CephFS mount error [opterr=%s]" % exc.errstr) + + if util.ioretry(lambda: util.pathexists(self.linkpath)): + if len(util.ioretry(lambda: util.listdir(self.linkpath))) != 0: + self.detach(sr_uuid) + raise xs_errors.XenError('SRExists') + else: + try: + util.ioretry(lambda: util.makedirs(self.linkpath)) + os.symlink(self.linkpath, self.path) + except util.CommandException, inst: + if inst.code != errno.EEXIST: + try: + 
self.unmount(self.mountpoint, True) + except CephFSException: + util.logException('CephFSSR.unmount()') + raise SR.SROSError(116, + "Failed to create CephFS SR. remote directory creation error: {}".format( + os.strerror(inst.code))) + self.detach(sr_uuid) + + def delete(self, sr_uuid): + # try to remove/delete non VDI contents first + super(CephFSSR, self).delete(sr_uuid) + try: + if self.checkmount(): + self.detach(sr_uuid) + self.mount() + if util.ioretry(lambda: util.pathexists(self.linkpath)): + util.ioretry(lambda: os.rmdir(self.linkpath)) + self.unmount(self.mountpoint, True) + except util.CommandException, inst: + self.detach(sr_uuid) + if inst.code != errno.ENOENT: + raise SR.SROSError(114, "Failed to remove CephFS mount point") + + def vdi(self, uuid, loadLocked=False): + return CephFSFileVDI(self, uuid) + + @staticmethod + def _is_ceph_available(): + import distutils.spawn + return distutils.spawn.find_executable('ceph') + +class CephFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = CephFSSR.DRIVER_TYPE + + return super(CephFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + def generate_config(self, sr_uuid, vdi_uuid): + util.SMlog("CephFSFileVDI.generate_config") + if not util.pathexists(self.path): + raise xs_errors.XenError('VDIUnavailable') + resp = {'device_config': self.sr.dconf, + 'sr_uuid': sr_uuid, + 'vdi_uuid': vdi_uuid, + 'sr_sm_config': self.sr.sm_config, + 'command': 'vdi_attach_from_config'} + # Return the 'config' encoded within a normal XMLRPC response so that + # we can use the regular response/error parsing code. 
+ config = xmlrpclib.dumps(tuple([resp]), "vdi_attach_from_config") + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + except: + util.logException("CephFSFileVDI.attach_from_config") + raise xs_errors.XenError('SRUnavailable', + opterr='Unable to attach from config') + + +if __name__ == '__main__': + SRCommand.run(CephFSSR, DRIVER_INFO) +else: + SR.registerSR(CephFSSR) diff --git a/drivers/GlusterFSSR.py b/drivers/GlusterFSSR.py new file mode 100644 index 00000000..a2f7484f --- /dev/null +++ b/drivers/GlusterFSSR.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +import errno +import os +import syslog as _syslog +import xmlrpclib +from syslog import syslog + +# careful with the import order here +# FileSR has a circular dependency: FileSR -> blktap2 -> lvutil -> EXTSR -> FileSR +# importing in this order seems to avoid triggering the issue. 
+import SR +import SRCommand +import FileSR +# end of careful +import cleanup +import util +import vhdutil +import xs_errors +from lock import Lock + +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "SR_CACHING", + "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", + "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", + "VDI_GENERATE_CONFIG", + "VDI_RESET_ON_BOOT/2", "ATOMIC_PAUSE"] + +CONFIGURATION = [['server', 'Full path to share on gluster server (required, ex: "192.168.0.12:/gv0")'], + ['backupservers', 'list of servers separated by ":"'], + ['fetchattempts', 'number of attempts to fetch files before switching to the backup server'] + ] + +DRIVER_INFO = { + 'name': 'GlusterFS VHD', + 'description': 'SR plugin which stores disks as VHD files on a GlusterFS storage', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2020 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +# The mountpoint for the directory when performing an sr_probe. All probes +# are guaranteed to be serialised by xapi, so this single mountpoint is fine. 
+PROBE_MOUNTPOINT = os.path.join(SR.MOUNT_BASE, "probe") + + +class GlusterFSException(Exception): + def __init__(self, errstr): + self.errstr = errstr + + +# mountpoint = /var/run/sr-mount/GlusterFS//uuid +# linkpath = mountpoint/uuid - path to SR directory on share +# path = /var/run/sr-mount/uuid - symlink to SR directory on share +class GlusterFSSR(FileSR.FileSR): + """Gluster file-based storage repository""" + + DRIVER_TYPE = 'glusterfs' + + def handles(sr_type): + # fudge, because the parent class (FileSR) checks for smb to alter its behavior + return sr_type == GlusterFSSR.DRIVER_TYPE or sr_type == 'smb' + + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_glusterfs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='glusterfs is not installed' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + self.driver_config = DRIVER_CONFIG + if 'server' not in self.dconf: + raise xs_errors.XenError('ConfigServerMissing') + self.remoteserver = self.dconf['server'] + if self.sr_ref and self.session is not None: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + self.mountpoint = os.path.join(SR.MOUNT_BASE, 'GlusterFS', self.remoteserver.split(':')[0], sr_uuid) + self.linkpath = os.path.join(self.mountpoint, sr_uuid or "") + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self._check_o_direct() + + def checkmount(self): + return util.ioretry(lambda: ((util.pathexists(self.mountpoint) and + util.ismount(self.mountpoint)) and + util.pathexists(self.linkpath))) + + def mount(self, mountpoint=None): + """Mount the remote gluster export at 'mountpoint'""" + if mountpoint is None: + mountpoint = self.mountpoint + elif not util.is_string(mountpoint) or mountpoint == "": + raise GlusterFSException("mountpoint not a string object") + + try: + if not 
util.ioretry(lambda: util.isdir(mountpoint)): + util.ioretry(lambda: util.makedirs(mountpoint)) + except util.CommandException, inst: + raise GlusterFSException("Failed to make directory: code is %d" % inst.code) + try: + options = [] + if 'backupservers' in self.dconf: + options.append('backup-volfile-servers=' + self.dconf['backupservers']) + if 'fetchattempts' in self.dconf: + options.append('fetch-attempts=' + self.dconf['fetchattempts']) + if options: + options = ['-o', ','.join(options)] + command = ["mount", '-t', 'glusterfs', self.remoteserver, mountpoint] + options + util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) + except util.CommandException, inst: + syslog(_syslog.LOG_ERR, 'GlusterFS mount failed ' + inst.__str__()) + raise GlusterFSException("mount failed with return code %d" % inst.code) + + # Sanity check to ensure that the user has at least RO access to the + # mounted share; if listing it fails, unmount again and report the error. + try: + util.listdir(mountpoint) + except util.CommandException: + try: + self.unmount(mountpoint, True) + except GlusterFSException: + util.logException('GlusterFSSR.unmount()') + raise GlusterFSException("Permission denied. 
Please check user privileges.") + + def unmount(self, mountpoint, rmmountpoint): + try: + util.pread(["umount", mountpoint]) + except util.CommandException, inst: + raise GlusterFSException("umount failed with return code %d" % inst.code) + if rmmountpoint: + try: + os.rmdir(mountpoint) + except OSError, inst: + raise GlusterFSException("rmdir failed with error '%s'" % inst.strerror) + + def attach(self, sr_uuid): + if not self.checkmount(): + try: + self.mount() + os.symlink(self.linkpath, self.path) + except GlusterFSException, exc: + raise SR.SROSError(12, exc.errstr) + self.attached = True + + def probe(self): + try: + self.mount(PROBE_MOUNTPOINT) + sr_list = filter(util.match_uuid, util.listdir(PROBE_MOUNTPOINT)) + self.unmount(PROBE_MOUNTPOINT, True) + except (util.CommandException, xs_errors.XenError): + raise + # Create a dictionary from the SR uuids to feed SRtoXML() + sr_dict = {sr_uuid: {} for sr_uuid in sr_list} + return util.SRtoXML(sr_dict) + + def detach(self, sr_uuid): + if not self.checkmount(): + return + util.SMlog("Aborting GC/coalesce") + cleanup.abort(self.uuid) + # Change directory to avoid unmount conflicts + os.chdir(SR.MOUNT_BASE) + self.unmount(self.mountpoint, True) + os.unlink(self.path) + self.attached = False + + def create(self, sr_uuid, size): + if self.checkmount(): + raise SR.SROSError(113, 'GlusterFS mount point already attached') + + try: + self.mount() + except GlusterFSException, exc: + # noinspection PyBroadException + try: + os.rmdir(self.mountpoint) + except: + # we have no recovery strategy + pass + raise SR.SROSError(111, "GlusterFS mount error [opterr=%s]" % exc.errstr) + + if util.ioretry(lambda: util.pathexists(self.linkpath)): + if len(util.ioretry(lambda: util.listdir(self.linkpath))) != 0: + self.detach(sr_uuid) + raise xs_errors.XenError('SRExists') + else: + try: + util.ioretry(lambda: util.makedirs(self.linkpath)) + os.symlink(self.linkpath, self.path) + except util.CommandException, inst: + if inst.code != 
errno.EEXIST: + try: + self.unmount(self.mountpoint, True) + except GlusterFSException: + util.logException('GlusterFSSR.unmount()') + raise SR.SROSError(116, + "Failed to create GlusterFS SR. remote directory creation error: {}".format( + os.strerror(inst.code))) + self.detach(sr_uuid) + + def delete(self, sr_uuid): + # try to remove/delete non VDI contents first + super(GlusterFSSR, self).delete(sr_uuid) + try: + if self.checkmount(): + self.detach(sr_uuid) + self.mount() + if util.ioretry(lambda: util.pathexists(self.linkpath)): + util.ioretry(lambda: os.rmdir(self.linkpath)) + self.unmount(self.mountpoint, True) + except util.CommandException, inst: + self.detach(sr_uuid) + if inst.code != errno.ENOENT: + raise SR.SROSError(114, "Failed to remove GlusterFS mount point") + + def vdi(self, uuid, loadLocked=False): + return GlusterFSFileVDI(self, uuid) + + @staticmethod + def _is_glusterfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('glusterfs') + + +class GlusterFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = GlusterFSSR.DRIVER_TYPE + + return super(GlusterFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + def generate_config(self, sr_uuid, vdi_uuid): + util.SMlog("GlusterFSFileVDI.generate_config") + if not util.pathexists(self.path): + raise xs_errors.XenError('VDIUnavailable') + resp = {'device_config': self.sr.dconf, + 'sr_uuid': sr_uuid, + 'vdi_uuid': vdi_uuid, + 'sr_sm_config': self.sr.sm_config, + 'command': 'vdi_attach_from_config'} + # Return the 'config' encoded within a normal XMLRPC response so that + # we can use the regular response/error parsing code. 
+ config = xmlrpclib.dumps(tuple([resp]), "vdi_attach_from_config") + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + except: + util.logException("GlusterFSFileVDI.attach_from_config") + raise xs_errors.XenError('SRUnavailable', + opterr='Unable to attach from config') + + +if __name__ == '__main__': + SRCommand.run(GlusterFSSR, DRIVER_INFO) +else: + SR.registerSR(GlusterFSSR) diff --git a/drivers/XFSSR.py b/drivers/XFSSR.py new file mode 100644 index 00000000..de35d738 --- /dev/null +++ b/drivers/XFSSR.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# XFSSR: Based on local-file storage repository, mounts xfs partition + +import SR, SRCommand, FileSR, util, lvutil, scsiutil + +import os +import xs_errors +import vhdutil +from lock import Lock +from constants import EXT_PREFIX + +CAPABILITIES = ["SR_PROBE","SR_UPDATE", "SR_SUPPORTS_LOCAL_CACHING", \ + "VDI_CREATE","VDI_DELETE","VDI_ATTACH","VDI_DETACH", \ + "VDI_UPDATE","VDI_CLONE","VDI_SNAPSHOT","VDI_RESIZE","VDI_MIRROR", \ + "VDI_GENERATE_CONFIG", \ + "VDI_RESET_ON_BOOT/2","ATOMIC_PAUSE", "VDI_CONFIG_CBT", + "VDI_ACTIVATE", "VDI_DEACTIVATE"] + +CONFIGURATION = [ [ 'device', 'local device path (required) (e.g. /dev/sda3)' ] ] + +DRIVER_INFO = { + 'name': 'Local XFS VHD', + 'description': 'SR plugin which represents disks as VHD files stored on a local XFS filesystem, created inside an LVM volume', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2019 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION + } + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + + +class XFSSR(FileSR.FileSR): + DRIVER_TYPE = 'xfs' + + """XFS Local file storage repository""" + def handles(srtype): + return srtype == XFSSR.DRIVER_TYPE + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_xfs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='xfsprogs is not installed' + ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + if not self.dconf.has_key('device') or not self.dconf['device']: + raise xs_errors.XenError('ConfigDeviceMissing') + + self.root = self.dconf['device'] + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self.vgname = 
EXT_PREFIX + sr_uuid + self.remotepath = os.path.join("/dev",self.vgname,sr_uuid) + self.attached = self._checkmount() + self.driver_config = DRIVER_CONFIG + + def delete(self, sr_uuid): + super(XFSSR, self).delete(sr_uuid) + + # Check PVs match VG + try: + for dev in self.root.split(','): + cmd = ["pvs", dev] + txt = util.pread2(cmd) + if txt.find(self.vgname) == -1: + raise xs_errors.XenError('VolNotFound', \ + opterr='volume is %s' % self.vgname) + except util.CommandException, inst: + raise xs_errors.XenError('PVSfailed', \ + opterr='error is %d' % inst.code) + + # Remove LV, VG and pv + try: + cmd = ["lvremove", "-f", self.remotepath] + util.pread2(cmd) + + cmd = ["vgremove", self.vgname] + util.pread2(cmd) + + for dev in self.root.split(','): + cmd = ["pvremove", dev] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMDelete', \ + opterr='errno is %d' % inst.code) + + def attach(self, sr_uuid): + if not self._checkmount(): + try: + #Activate LV + cmd = ['lvchange','-ay',self.remotepath] + util.pread2(cmd) + + # make a mountpoint: + if not os.path.isdir(self.path): + os.makedirs(self.path) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Unable to activate LV. Errno is %d' % inst.code) + + try: + util.pread(["fsck", "-a", self.remotepath]) + except util.CommandException, inst: + if inst.code == 1: + util.SMlog("FSCK detected and corrected FS errors. Not fatal.") + else: + raise xs_errors.XenError('LVMMount', \ + opterr='FSCK failed on %s. Errno is %d' % (self.remotepath,inst.code)) + + try: + util.pread(["mount", self.remotepath, self.path]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Failed to mount FS. 
Errno is %d' % inst.code) + + self.attached = True + + #Update SCSIid string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + # Set the block scheduler + for dev in self.root.split(','): self.block_setscheduler(dev) + + def detach(self, sr_uuid): + super(XFSSR, self).detach(sr_uuid) + try: + # deactivate SR + cmd = ["lvchange", "-an", self.remotepath] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMUnMount', \ + opterr='lvm -an failed errno is %d' % inst.code) + + def probe(self): + return lvutil.srlist_toxml(lvutil.scan_srlist(EXT_PREFIX, self.root), + EXT_PREFIX) + + def create(self, sr_uuid, size): + if self._checkmount(): + raise xs_errors.XenError('SRExists') + + # Check none of the devices already in use by other PBDs + if util.test_hostPBD_devs(self.session, sr_uuid, self.root): + raise xs_errors.XenError('SRInUse') + + # Check serial number entry in SR records + for dev in self.root.split(','): + if util.test_scsiserial(self.session, dev): + raise xs_errors.XenError('SRInUse') + + if not lvutil._checkVG(self.vgname): + lvutil.createVG(self.root, self.vgname) + + if lvutil._checkLV(self.remotepath): + raise xs_errors.XenError('SRExists') + + try: + numdevs = len(self.root.split(',')) + cmd = ["lvcreate", "-n", sr_uuid] + if numdevs > 1: + lowest = -1 + for dev in self.root.split(','): + stats = lvutil._getPVstats(dev) + if lowest < 0 or stats['freespace'] < lowest: + lowest = stats['freespace'] + size_mb = (lowest / (1024 * 1024)) * numdevs + + # Add stripe parameter to command + cmd += ["-i", str(numdevs), "-I", "2048"] + else: + stats = lvutil._getVGstats(self.vgname) + size_mb = stats['freespace'] / (1024 * 1024) + assert(size_mb > 0) + cmd += ["-L", str(size_mb), self.vgname] + text = util.pread(cmd) + + cmd = ["lvchange", "-ay", self.remotepath] + text = util.pread(cmd) + except util.CommandException, inst: + raise 
xs_errors.XenError('LVMCreate', \ + opterr='lv operation, error %d' % inst.code) + except AssertionError: + raise xs_errors.XenError('SRNoSpace', \ + opterr='Insufficient space in VG %s' % self.vgname) + + try: + util.pread2(["mkfs.xfs", self.remotepath]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMFilesystem', \ + opterr='mkfs failed error %d' % inst.code) + + #Update serial number string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + def vdi(self, uuid, loadLocked=False): + return XFSFileVDI(self, uuid) + + @staticmethod + def _is_xfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('mkfs.xfs') + + +class XFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self,'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = XFSSR.DRIVER_TYPE + + return super(XFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + +if __name__ == '__main__': + SRCommand.run(XFSSR, DRIVER_INFO) +else: + SR.registerSR(XFSSR) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 97c332c8..ad1ee862 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2765,7 +2765,10 @@ def normalizeType(type): if type in ["lvm", "lvmoiscsi", "lvmohba", "lvmofcoe"]: # temporary while LVHD is symlinked as LVM type = SR.TYPE_LVHD - if type in ["ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb"]: + if type in [ + "ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb", "cephfs", "glusterfs", + "xfs" + ]: type = SR.TYPE_FILE if not type in SR.TYPES: raise util.SMException("Unsupported SR type: %s" % type) From dbfbe5fb194286c502a2e40fe6d6f4a1eb82526e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 12 Aug 2020 11:14:33 +0200 Subject: [PATCH 005/133] feat(drivers): add ZFS driver to avoid losing VDI metadata (xcp-ng/xcp#401) --- Makefile | 1 + drivers/XE_SR_ERRORCODES.xml | 10 +++ drivers/ZFSSR.py | 137 
+++++++++++++++++++++++++++++++++++ drivers/cleanup.py | 2 +- 4 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 drivers/ZFSSR.py diff --git a/Makefile b/Makefile index 0a10470a..f93b7be5 100755 --- a/Makefile +++ b/Makefile @@ -20,6 +20,7 @@ SM_DRIVERS += LVHDoFCoE SM_DRIVERS += CephFS SM_DRIVERS += GlusterFS SM_DRIVERS += XFS +SM_DRIVERS += ZFS SM_LIBS := SR SM_LIBS += SRCommand diff --git a/drivers/XE_SR_ERRORCODES.xml b/drivers/XE_SR_ERRORCODES.xml index 97236fe0..1f58adca 100755 --- a/drivers/XE_SR_ERRORCODES.xml +++ b/drivers/XE_SR_ERRORCODES.xml @@ -887,5 +887,15 @@ 1200 + + ZFSSRCreate + ZFS SR creation error + 5000 + + + ZFSSRDelete + ZFS SR deletion error + 5001 + diff --git a/drivers/ZFSSR.py b/drivers/ZFSSR.py new file mode 100644 index 00000000..1b2f398f --- /dev/null +++ b/drivers/ZFSSR.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import SR +import SRCommand + +import FileSR + +import util +import xs_errors + +CAPABILITIES = [ + 'SR_PROBE', + 'SR_UPDATE', + 'VDI_CREATE', + 'VDI_DELETE', + 'VDI_ATTACH', + 'VDI_DETACH', + 'VDI_CLONE', + 'VDI_SNAPSHOT', + 'VDI_RESIZE', + 'VDI_MIRROR', + 'VDI_GENERATE_CONFIG', + 'ATOMIC_PAUSE', + 'VDI_CONFIG_CBT', + 'VDI_ACTIVATE', + 'VDI_DEACTIVATE', + 'THIN_PROVISIONING' +] + +CONFIGURATION = [ + ['location', 'local ZFS directory path (required)'] +] + +DRIVER_INFO = { + 'name': 'Local ZFS VHD', + 'description': + 'SR plugin which represents disks as VHD files stored on a ZFS disk', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2020 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + + +class ZFSSR(FileSR.FileSR): + DRIVER_TYPE = 'zfs' + + @staticmethod + def handles(type): + return type == ZFSSR.DRIVER_TYPE + + def load(self, sr_uuid): + if not self._is_zfs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='zfs is not installed or module is not loaded' + ) + return super(ZFSSR, self).load(sr_uuid) + + def create(self, sr_uuid, size): + if not self._is_zfs_path(self.remotepath): + raise xs_errors.XenError( + 'ZFSSRCreate', + opterr='Cannot create SR, path is not a ZFS mountpoint' + ) + return super(ZFSSR, self).create(sr_uuid, size) + + def delete(self, sr_uuid): + if not self._checkmount(): + raise xs_errors.XenError( + 'ZFSSRDelete', + opterr='ZFS SR is not mounted or uses an invalid FS type' + ) + return super(ZFSSR, self).delete(sr_uuid) + + def attach(self, sr_uuid): + if not self._is_zfs_path(self.remotepath): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='Invalid ZFS path' + ) + return super(ZFSSR, self).attach(sr_uuid) + + def detach(self, sr_uuid): + return super(ZFSSR, self).detach(sr_uuid) + + def vdi(self, uuid, loadLocked=False): + return ZFSFileVDI(self, uuid) + + # Ensure _checkmount is overridden to prevent bad behaviors in 
FileSR. + def _checkmount(self): + return super(ZFSSR, self)._checkmount() and \ + self._is_zfs_path(self.remotepath) + + @staticmethod + def _is_zfs_path(path): + cmd = ['findmnt', '-o', 'FSTYPE', '-n', path] + fs_type = util.pread2(cmd).split('\n')[0] + return fs_type == 'zfs' + + @staticmethod + def _is_zfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('zfs') and \ + util.pathexists('/sys/module/zfs/initstate') + + +class ZFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = ZFSSR.DRIVER_TYPE + + return super(ZFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + +if __name__ == '__main__': + SRCommand.run(ZFSSR, DRIVER_INFO) +else: + SR.registerSR(ZFSSR) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index ad1ee862..327103f4 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2767,7 +2767,7 @@ def normalizeType(type): type = SR.TYPE_LVHD if type in [ "ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb", "cephfs", "glusterfs", - "xfs" + "xfs", "zfs" ]: type = SR.TYPE_FILE if not type in SR.TYPES: From 696bd1cac5552ff3559fa229bc376cfb0fbe4092 Mon Sep 17 00:00:00 2001 From: Samuel Verschelde Date: Thu, 13 Aug 2020 17:10:12 +0200 Subject: [PATCH 006/133] Re-add the ext4 driver for users who need to transition The driver is needed to transition to the ext driver. Users who upgrade from XCP-ng <= 8.0 need a working driver so that they can move the VMs out of the ext4 SR and delete the SR. Not keeping that driver would force such users to upgrade to 8.1 first, convert their SR, then upgrade to a higher version. However, like in XCP-ng 8.1, the driver will refuse any new ext4 SR creation. 
--- Makefile | 1 + drivers/EXT4SR.py | 235 +++++++++++++++++++++++++++++++++++++++++++++ drivers/cleanup.py | 2 +- 3 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 drivers/EXT4SR.py diff --git a/Makefile b/Makefile index f93b7be5..9bc0a9d8 100755 --- a/Makefile +++ b/Makefile @@ -21,6 +21,7 @@ SM_DRIVERS += CephFS SM_DRIVERS += GlusterFS SM_DRIVERS += XFS SM_DRIVERS += ZFS +SM_DRIVERS += EXT4 SM_LIBS := SR SM_LIBS += SRCommand diff --git a/drivers/EXT4SR.py b/drivers/EXT4SR.py new file mode 100644 index 00000000..bd67d003 --- /dev/null +++ b/drivers/EXT4SR.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# EXT4SR: Based on local-file storage repository, mounts ext4 partition + +import SR, SRCommand, FileSR, util, lvutil, scsiutil + +import os +import xs_errors +import vhdutil +from lock import Lock +from constants import EXT_PREFIX + +CAPABILITIES = ["SR_PROBE","SR_UPDATE", "SR_SUPPORTS_LOCAL_CACHING", \ + "VDI_CREATE","VDI_DELETE","VDI_ATTACH","VDI_DETACH", \ + "VDI_UPDATE","VDI_CLONE","VDI_SNAPSHOT","VDI_RESIZE","VDI_MIRROR", \ + "VDI_GENERATE_CONFIG", \ + "VDI_RESET_ON_BOOT/2","ATOMIC_PAUSE", "VDI_CONFIG_CBT", + "VDI_ACTIVATE", "VDI_DEACTIVATE"] + +CONFIGURATION = [ [ 'device', 'local device path (required) (e.g. /dev/sda3)' ] ] + +DRIVER_INFO = { + 'name': 'Local EXT4 VHD', + 'description': 'SR plugin which represents disks as VHD files stored on a local EXT4 filesystem, created inside an LVM volume', + 'vendor': 'Vates SAS', + 'copyright': '(C) 2019 Vates SAS', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION + } + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +class EXT4SR(FileSR.FileSR): + """EXT4 Local file storage repository""" + def handles(srtype): + return srtype == 'ext4' + handles = staticmethod(handles) + + def load(self, sr_uuid): + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + if not self.dconf.has_key('device') or not self.dconf['device']: + raise xs_errors.XenError('ConfigDeviceMissing') + + self.root = self.dconf['device'] + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self.vgname = EXT_PREFIX + sr_uuid + self.remotepath = os.path.join("/dev",self.vgname,sr_uuid) + self.attached = self._checkmount() + self.driver_config = DRIVER_CONFIG + + def 
delete(self, sr_uuid): + super(EXT4SR, self).delete(sr_uuid) + + # Check PVs match VG + try: + for dev in self.root.split(','): + cmd = ["pvs", dev] + txt = util.pread2(cmd) + if txt.find(self.vgname) == -1: + raise xs_errors.XenError('VolNotFound', \ + opterr='volume is %s' % self.vgname) + except util.CommandException, inst: + raise xs_errors.XenError('PVSfailed', \ + opterr='error is %d' % inst.code) + + # Remove LV, VG and pv + try: + cmd = ["lvremove", "-f", self.remotepath] + util.pread2(cmd) + + cmd = ["vgremove", self.vgname] + util.pread2(cmd) + + for dev in self.root.split(','): + cmd = ["pvremove", dev] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMDelete', \ + opterr='errno is %d' % inst.code) + + def attach(self, sr_uuid): + if not self._checkmount(): + try: + #Activate LV + cmd = ['lvchange','-ay',self.remotepath] + util.pread2(cmd) + + # make a mountpoint: + if not os.path.isdir(self.path): + os.makedirs(self.path) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Unable to activate LV. Errno is %d' % inst.code) + + try: + util.pread(["fsck", "-a", self.remotepath]) + except util.CommandException, inst: + if inst.code == 1: + util.SMlog("FSCK detected and corrected FS errors. Not fatal.") + else: + raise xs_errors.XenError('LVMMount', \ + opterr='FSCK failed on %s. Errno is %d' % (self.remotepath,inst.code)) + + try: + util.pread(["mount", self.remotepath, self.path]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMMount', \ + opterr='Failed to mount FS. 
Errno is %d' % inst.code) + + self.attached = True + + #Update SCSIid string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + # Set the block scheduler + for dev in self.root.split(','): self.block_setscheduler(dev) + + def detach(self, sr_uuid): + super(EXT4SR, self).detach(sr_uuid) + try: + # deactivate SR + cmd = ["lvchange", "-an", self.remotepath] + util.pread2(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMUnMount', \ + opterr='lvm -an failed errno is %d' % inst.code) + + def probe(self): + return lvutil.srlist_toxml(lvutil.scan_srlist(EXT_PREFIX, self.root), + EXT_PREFIX) + + def create(self, sr_uuid, size): + # THIS DRIVER IS DEPRECATED. RAISE. + raise Exception('The `ext4` SR type is deprecated since XCP-ng 8.1.\n' + 'Use the main `ext` driver instead. It will create an EXT4 filesystem now, ' + 'not EXT3 anymore as it used to.') + + if self._checkmount(): + raise xs_errors.XenError('SRExists') + + # Check none of the devices already in use by other PBDs + if util.test_hostPBD_devs(self.session, sr_uuid, self.root): + raise xs_errors.XenError('SRInUse') + + # Check serial number entry in SR records + for dev in self.root.split(','): + if util.test_scsiserial(self.session, dev): + raise xs_errors.XenError('SRInUse') + + if not lvutil._checkVG(self.vgname): + lvutil.createVG(self.root, self.vgname) + + if lvutil._checkLV(self.remotepath): + raise xs_errors.XenError('SRExists') + + try: + numdevs = len(self.root.split(',')) + cmd = ["lvcreate", "-n", sr_uuid] + if numdevs > 1: + lowest = -1 + for dev in self.root.split(','): + stats = lvutil._getPVstats(dev) + if lowest < 0 or stats['freespace'] < lowest: + lowest = stats['freespace'] + size_mb = (lowest / (1024 * 1024)) * numdevs + + # Add stripe parameter to command + cmd += ["-i", str(numdevs), "-I", "2048"] + else: + stats = lvutil._getVGstats(self.vgname) + size_mb = stats['freespace'] / (1024 * 1024) + 
assert(size_mb > 0) + cmd += ["-L", str(size_mb), self.vgname] + text = util.pread(cmd) + + cmd = ["lvchange", "-ay", self.remotepath] + text = util.pread(cmd) + except util.CommandException, inst: + raise xs_errors.XenError('LVMCreate', \ + opterr='lv operation, error %d' % inst.code) + except AssertionError: + raise xs_errors.XenError('SRNoSpace', \ + opterr='Insufficient space in VG %s' % self.vgname) + + try: + util.pread2(["mkfs.ext4", "-F", self.remotepath]) + except util.CommandException, inst: + raise xs_errors.XenError('LVMFilesystem', \ + opterr='mkfs failed error %d' % inst.code) + + #Update serial number string + scsiutil.add_serial_record(self.session, self.sr_ref, \ + scsiutil.devlist_to_serialstring(self.root.split(','))) + + def vdi(self, uuid, loadLocked = False): + if not loadLocked: + return EXTFileVDI(self, uuid) + return EXTFileVDI(self, uuid) + + +class EXTFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self,'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data["storage-type"]="ext" + + return super(EXTFileVDI, self).attach(sr_uuid, vdi_uuid) + + +if __name__ == '__main__': + SRCommand.run(EXT4SR, DRIVER_INFO) +else: + SR.registerSR(EXT4SR) \ No newline at end of file diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 327103f4..867c7896 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2767,7 +2767,7 @@ def normalizeType(type): type = SR.TYPE_LVHD if type in [ "ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb", "cephfs", "glusterfs", - "xfs", "zfs" + "xfs", "zfs", "ext4" ]: type = SR.TYPE_FILE if not type in SR.TYPES: From b06f04fcf8a8aefe69d4300cbaa3fc9821dd8100 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 16 Mar 2020 15:39:44 +0100 Subject: [PATCH 007/133] feat(drivers): add LinstorSR driver Some important points: - linstor.KV must use an identifier name that starts with a letter (so it uses a "sr-" prefix). 
- Encrypted VDIs are supported with key_hash attribute (not tested, experimental). - When a new LINSTOR volume is created on a host (via snapshot or create), the remaining diskless devices are not necessarily created on other hosts. So if a resource definition exists without local device path, we request it from LINSTOR. Wait 5s for symlink creation when a new volume is created => 5s is purely arbitrary, but this guarantees that we do not try to access the volume if the symlink has not yet been created by the udev rule. - Can change the provisioning using the device config 'provisioning' param. - We can only increase volume size (See: https://github.com/LINBIT/linstor-server/issues/66), it would be great if we could shrink volumes to limit the space used by the snapshots. - Inflate/Deflate can only be executed on the master host, a linstor-manager plugin is present to do this from slaves. The same plugin is used to open LINSTOR ports + start controller. - Use a `total_allocated_volume_size` method to have a good idea of the reserved memory. Why? Because `physical_free_size` is computed using the LVM used size, in the case of thick provisioning it's ok, but when thin provisioning is chosen LVM returns only the allocated size using the used block count. So this method solves this problem, it takes the fixed virtual volume size of each node to compute the required size to store the volume data. - Call vhd-util on remote hosts using the linstor-manager when necessary, i.e. vhd-util is called to get vhd info, the DRBD device can be in use (and unusable by external processes), so we must use the local LVM device that contains the DRBD data or a remote disk if the DRBD device is diskless. - If a DRBD device is in use when vhdutil.getVHDInfo is called, we must have no errors. So a LinstorVhdUtil wrapper is now used to bypass DRBD layer when VDIs are loaded.
- Refresh PhyLink when unpause in called on DRBD devices: We must always recreate the symlink to ensure we have the right info. Why? Because if the volume UUID is changed in LINSTOR the symlink is not directly updated. When live leaf coalesce is executed we have these steps: "A" -> "OLD_A" "B" -> "A" Without symlink update the previous "A" path is reused instead of "B" path. Note: "A", "B" and "OLD_A" are UUIDs. - Since linstor python modules are not present on every XCP-ng host, module imports are protected by try.. except... blocks. - Provide a linstor-monitor daemon to check master changes --- Makefile | 10 +- drivers/LinstorSR.py | 2100 +++++++++++++++++++++++++++++++ drivers/XE_SR_ERRORCODES.xml | 36 + drivers/cleanup.py | 366 +++++- drivers/linstor-manager | 272 ++++ drivers/linstorjournaler.py | 155 +++ drivers/linstorvhdutil.py | 186 +++ drivers/linstorvolumemanager.py | 1713 +++++++++++++++++++++++++ drivers/tapdisk-pause | 70 +- drivers/util.py | 46 +- linstor/Makefile | 22 + linstor/linstor-monitord.c | 402 ++++++ systemd/linstor-monitor.service | 13 + tests/mocks/linstor/__init__.py | 0 14 files changed, 5361 insertions(+), 30 deletions(-) create mode 100755 drivers/LinstorSR.py create mode 100755 drivers/linstor-manager create mode 100755 drivers/linstorjournaler.py create mode 100644 drivers/linstorvhdutil.py create mode 100755 drivers/linstorvolumemanager.py create mode 100644 linstor/Makefile create mode 100644 linstor/linstor-monitord.c create mode 100644 systemd/linstor-monitor.service create mode 100644 tests/mocks/linstor/__init__.py diff --git a/Makefile b/Makefile index 9bc0a9d8..609e1828 100755 --- a/Makefile +++ b/Makefile @@ -8,6 +8,7 @@ SM_DRIVERS += udev SM_DRIVERS += ISO SM_DRIVERS += HBA SM_DRIVERS += RawHBA +SM_DRIVERS += Linstor SM_DRIVERS += LVHD SM_DRIVERS += LVHDoISCSI SM_DRIVERS += LVHDoHBA @@ -35,6 +36,9 @@ SM_LIBS += verifyVHDsOnSR SM_LIBS += scsiutil SM_LIBS += scsi_host_rescan SM_LIBS += vhdutil +SM_LIBS += linstorjournaler 
+SM_LIBS += linstorvhdutil +SM_LIBS += linstorvolumemanager SM_LIBS += lvhdutil SM_LIBS += cifutils SM_LIBS += xs_errors @@ -100,6 +104,7 @@ SM_PY_FILES = $(foreach LIB, $(SM_LIBS), drivers/$(LIB).py) $(foreach DRIVER, $( .PHONY: build build: make -C dcopy + make -C linstor .PHONY: precommit precommit: build @@ -179,6 +184,8 @@ install: precheck $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/sr_health_check.timer \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) + install -m 644 systemd/linstor-monitor.service \ + $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) for i in $(UDEV_RULES); do \ install -m 644 udev/$$i.rules \ $(SM_STAGING)$(UDEV_RULES_DIR); done @@ -199,6 +206,7 @@ install: precheck cd $(SM_STAGING)$(SM_DEST) && rm -f OCFSoHBASR ln -sf $(SM_DEST)mpathutil.py $(SM_STAGING)/sbin/mpathutil install -m 755 drivers/02-vhdcleanup $(SM_STAGING)$(MASTER_SCRIPT_DEST) + install -m 755 drivers/linstor-manager $(SM_STAGING)$(PLUGIN_SCRIPT_DEST) install -m 755 drivers/lvhd-thin $(SM_STAGING)$(PLUGIN_SCRIPT_DEST) install -m 755 drivers/on_slave.py $(SM_STAGING)$(PLUGIN_SCRIPT_DEST)/on-slave install -m 755 drivers/testing-hooks $(SM_STAGING)$(PLUGIN_SCRIPT_DEST) @@ -217,6 +225,7 @@ install: precheck install -m 755 scripts/kickpipe $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/set-iscsi-initiator $(SM_STAGING)$(LIBEXEC) $(MAKE) -C dcopy install DESTDIR=$(SM_STAGING) + $(MAKE) -C linstor install DESTDIR=$(SM_STAGING) ln -sf $(SM_DEST)blktap2.py $(SM_STAGING)$(BIN_DEST)/blktap2 ln -sf $(SM_DEST)lcache.py $(SM_STAGING)$(BIN_DEST)tapdisk-cache-stats ln -sf /dev/null $(SM_STAGING)$(UDEV_RULES_DIR)/69-dm-lvm-metad.rules @@ -230,4 +239,3 @@ install: precheck .PHONY: clean clean: rm -rf $(SM_STAGING) - diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py new file mode 100755 index 00000000..8be18367 --- /dev/null +++ b/drivers/LinstorSR.py @@ -0,0 +1,2100 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you 
can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from constants import CBTLOG_TAG + +try: + from linstorjournaler import LinstorJournaler + from linstorvhdutil import LinstorVhdUtil + from linstorvolumemanager \ + import LinstorVolumeManager, LinstorVolumeManagerError + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + +from lock import Lock +import blktap2 +import cleanup +import errno +import functools +import scsiutil +import SR +import SRCommand +import time +import traceback +import util +import VDI +import vhdutil +import xmlrpclib +import xs_errors + +from srmetadata import \ + NAME_LABEL_TAG, NAME_DESCRIPTION_TAG, IS_A_SNAPSHOT_TAG, SNAPSHOT_OF_TAG, \ + TYPE_TAG, VDI_TYPE_TAG, READ_ONLY_TAG, SNAPSHOT_TIME_TAG, \ + METADATA_OF_POOL_TAG + +HIDDEN_TAG = 'hidden' + +# ============================================================================== + +# TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', +# 'VDI_CONFIG_CBT', 'SR_PROBE' + +CAPABILITIES = [ + 'ATOMIC_PAUSE', + 'SR_UPDATE', + 'VDI_CREATE', + 'VDI_DELETE', + 'VDI_UPDATE', + 'VDI_ATTACH', + 'VDI_DETACH', + 'VDI_ACTIVATE', + 'VDI_DEACTIVATE', + 'VDI_CLONE', + 'VDI_MIRROR', + 'VDI_RESIZE', + 'VDI_SNAPSHOT', + 'VDI_GENERATE_CONFIG' +] + +CONFIGURATION = [ + ['group-name', 'LVM group name'], + ['hosts', 'host names to use'], + ['redundancy', 'replication count'], + ['provisioning', '"thin" or "thick" are accepted'] +] + +DRIVER_INFO = { + 'name': 
'LINSTOR resources on XCP-ng', + 'description': 'SR plugin which uses Linstor to manage VDIs', + 'vendor': 'Vates', + 'copyright': '(C) 2020 Vates', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {'ATTACH_FROM_CONFIG_WITH_TAPDISK': False} + +OPS_EXCLUSIVE = [ + 'sr_create', 'sr_delete', 'sr_attach', 'sr_detach', 'sr_scan', + 'sr_update', 'vdi_create', 'vdi_delete', 'vdi_clone', 'vdi_snapshot' +] + +# ============================================================================== +# Misc helpers used by LinstorSR and linstor-thin plugin. +# ============================================================================== + + +def compute_volume_size(virtual_size, image_type): + if image_type == vhdutil.VDI_TYPE_VHD: + # All LINSTOR VDIs have the metadata area preallocated for + # the maximum possible virtual size (for fast online VDI.resize). + meta_overhead = vhdutil.calcOverheadEmpty(LinstorVDI.MAX_SIZE) + bitmap_overhead = vhdutil.calcOverheadBitmap(virtual_size) + virtual_size += meta_overhead + bitmap_overhead + elif image_type != vhdutil.VDI_TYPE_RAW: + raise Exception('Invalid image type: {}'.format(image_type)) + + return LinstorVolumeManager.round_up_volume_size(virtual_size) + + +def try_lock(lock): + for i in range(20): + if lock.acquireNoblock(): + return + time.sleep(1) + raise util.SRBusyException() + + +def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): + volume_metadata = linstor.get_volume_metadata(vdi_uuid) + image_type = volume_metadata.get(VDI_TYPE_TAG) + if image_type == vhdutil.VDI_TYPE_RAW: + return + + lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) + try: + try_lock(lock) + + device_path = linstor.get_device_path(vdi_uuid) + + # If the virtual VHD size is lower than the LINSTOR volume size, + # there is nothing to do. 
+ vhd_size = compute_volume_size( + LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), + image_type + ) + + volume_info = linstor.get_volume_info(vdi_uuid) + volume_size = volume_info.virtual_size + + if vhd_size > volume_size: + inflate( + journaler, linstor, vdi_uuid, device_path, + vhd_size, volume_size + ) + finally: + lock.release() + + +def detach_thin(session, linstor, sr_uuid, vdi_uuid): + volume_metadata = linstor.get_volume_metadata(vdi_uuid) + image_type = volume_metadata.get(VDI_TYPE_TAG) + if image_type == vhdutil.VDI_TYPE_RAW: + return + + lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) + try: + try_lock(lock) + + vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) + vbds = session.xenapi.VBD.get_all_records_where( + 'field "VDI" = "{}"'.format(vdi_ref) + ) + + num_plugged = 0 + for vbd_rec in vbds.values(): + if vbd_rec['currently_attached']: + num_plugged += 1 + if num_plugged > 1: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot deflate VDI {}, already used by ' + 'at least 2 VBDs'.format(vdi_uuid) + ) + + device_path = linstor.get_device_path(vdi_uuid) + new_volume_size = LinstorVolumeManager.round_up_volume_size( + LinstorVhdUtil(session, linstor).get_size_phys(device_path) + ) + + volume_info = linstor.get_volume_info(vdi_uuid) + old_volume_size = volume_info.virtual_size + deflate(vdi_uuid, device_path, new_volume_size, old_volume_size) + finally: + lock.release() + + +def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): + # Only inflate if the LINSTOR volume capacity is not enough. 
+ new_size = LinstorVolumeManager.round_up_volume_size(new_size) + if new_size <= old_size: + return + + util.SMlog( + 'Inflate {} (new VHD size={}, previous={})' + .format(vdi_uuid, new_size, old_size) + ) + + journaler.create( + LinstorJournaler.INFLATE, vdi_uuid, old_size + ) + linstor.resize_volume(vdi_uuid, new_size) + + if not util.zeroOut( + vdi_path, new_size - vhdutil.VHD_FOOTER_SIZE, + vhdutil.VHD_FOOTER_SIZE + ): + raise xs_errors.XenError( + 'EIO', + opterr='Failed to zero out VHD footer {}'.format(vdi_path) + ) + + vhdutil.setSizePhys(vdi_path, new_size, False) + journaler.remove(LinstorJournaler.INFLATE, vdi_uuid) + + +def deflate(vdi_uuid, vdi_path, new_size, old_size): + new_size = LinstorVolumeManager.round_up_volume_size(new_size) + if new_size >= old_size: + return + + util.SMlog( + 'Deflate {} (new size={}, previous={})' + .format(vdi_uuid, new_size, old_size) + ) + + vhdutil.setSizePhys(vdi_path, new_size) + # TODO: Change the LINSTOR volume size using linstor.resize_volume. + + +# ============================================================================== + +# Usage example: +# xe sr-create type=linstor name-label=linstor-sr +# host-uuid=d2deba7a-c5ad-4de1-9a20-5c8df3343e93 +# device-config:hosts=node-linstor1,node-linstor2,node-linstor3 +# device-config:group-name=vg_loop device-config:redundancy=2 + + +class LinstorSR(SR.SR): + DRIVER_TYPE = 'linstor' + + PROVISIONING_TYPES = ['thin', 'thick'] + PROVISIONING_DEFAULT = 'thin' + + MANAGER_PLUGIN = 'linstor-manager' + + # -------------------------------------------------------------------------- + # SR methods. + # -------------------------------------------------------------------------- + + @staticmethod + def handles(type): + return type == LinstorSR.DRIVER_TYPE + + def load(self, sr_uuid): + if not LINSTOR_AVAILABLE: + raise util.SMException( + 'Can\'t load LinstorSR: LINSTOR libraries are missing' + ) + + # Check parameters. 
+ if 'hosts' not in self.dconf or not self.dconf['hosts']: + raise xs_errors.XenError('LinstorConfigHostsMissing') + if 'group-name' not in self.dconf or not self.dconf['group-name']: + raise xs_errors.XenError('LinstorConfigGroupNameMissing') + if 'redundancy' not in self.dconf or not self.dconf['redundancy']: + raise xs_errors.XenError('LinstorConfigRedundancyMissing') + + self.driver_config = DRIVER_CONFIG + + # Check provisioning config. + provisioning = self.dconf.get('provisioning') + if provisioning: + if provisioning in self.PROVISIONING_TYPES: + self._provisioning = provisioning + else: + raise xs_errors.XenError( + 'InvalidArg', + opterr='Provisioning parameter must be one of {}'.format( + self.PROVISIONING_TYPES + ) + ) + else: + self._provisioning = self.PROVISIONING_DEFAULT + + # Note: We don't have access to the session field if the + # 'vdi_attach_from_config' command is executed. + self._has_session = self.sr_ref and self.session is not None + if self._has_session: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + + provisioning = self.sm_config.get('provisioning') + if provisioning in self.PROVISIONING_TYPES: + self._provisioning = provisioning + + # Define properties for SR parent class. + self.ops_exclusive = OPS_EXCLUSIVE + self.path = LinstorVolumeManager.DEV_ROOT_PATH + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + + self._hosts = self.dconf['hosts'].split(',') + self._redundancy = int(self.dconf['redundancy'] or 1) + self._linstor = None # Ensure that LINSTOR attribute exists. 
def _locked_load(method):
    """
    Decorator running the one-time SR initialisation before `method`.

    The first decorated call on an instance sets `self._initialized` and
    builds the LINSTOR helpers (`_master_uri`, `_journaler`, `_linstor`,
    `_vhdutil`); every later decorated call goes straight to `method`.

    Raises XenError('LinstorMaster') when a master-only command is issued
    on a non-master host and XenError('SRUnavailable') when the LINSTOR
    volume manager cannot be opened or queried.
    """
    @functools.wraps(method)
    def wrap(self, *args, **kwargs):
        # Initialisation is attempted only once per instance: the flag is
        # raised before the work starts, so a failure below is not retried.
        if self._initialized:
            return method(self, *args, **kwargs)
        self._initialized = True

        if not self._has_session:
            # NOTE(review): every path of this session-less branch returns
            # None without calling `method` -- confirm that callers of the
            # first decorated call expect this.
            if self.srcmd.cmd == 'vdi_attach_from_config':
                # We must have a valid LINSTOR instance here without using
                # the XAPI.
                self._master_uri = 'linstor://{}'.format(
                    util.get_master_address()
                )
                self._journaler = LinstorJournaler(
                    self._master_uri, self._group_name, logger=util.SMlog
                )

                try:
                    self._linstor = LinstorVolumeManager(
                        self._master_uri,
                        self._group_name,
                        logger=util.SMlog
                    )
                    return
                except Exception as e:
                    # Best effort: `self._linstor` keeps its previous value.
                    util.SMlog(
                        'Ignore exception. Failed to build LINSTOR '
                        'instance without session: {}'.format(e)
                    )
            return

        self._master_uri = 'linstor://{}'.format(
            util.get_master_rec(self.session)['address']
        )

        if not self._is_master:
            # These commands are refused on a host that is not the master.
            if self.cmd in [
                'sr_create', 'sr_delete', 'sr_update', 'sr_probe',
                'sr_scan', 'vdi_create', 'vdi_delete', 'vdi_resize',
                'vdi_snapshot', 'vdi_clone'
            ]:
                util.SMlog('{} blocked for non-master'.format(self.cmd))
                raise xs_errors.XenError('LinstorMaster')

            # Because the LINSTOR KV objects cache all values, we must lock
            # the VDI before the LinstorJournaler/LinstorVolumeManager
            # instantiation and before any action on the master to avoid a
            # bad read. The lock is also necessary to avoid strange
            # behaviors if the GC is executed during an action on a slave.
            if self.cmd.startswith('vdi_'):
                self._shared_lock_vdi(self.srcmd.params['vdi_uuid'])
                # Remembered so that cleanup() can release the lock.
                self._vdi_shared_locked = True

        self._journaler = LinstorJournaler(
            self._master_uri, self._group_name, logger=util.SMlog
        )

        # Ensure ports are opened and LINSTOR controller/satellite
        # are activated.
        if self.srcmd.cmd == 'sr_create':
            # TODO: Disable if necessary
            self._enable_linstor_on_all_hosts(status=True)

        try:
            # Try to open SR if exists.
            self._linstor = LinstorVolumeManager(
                self._master_uri,
                self._group_name,
                repair=self._is_master,
                logger=util.SMlog
            )
            self._vhdutil = LinstorVhdUtil(self.session, self._linstor)
        except Exception as e:
            if self.srcmd.cmd == 'sr_create' or \
                    self.srcmd.cmd == 'sr_detach':
                # Ignore exception in this specific case: sr_create.
                # At this moment the LinstorVolumeManager cannot be
                # instantiated. Concerning the sr_detach command, we must
                # ignore LINSTOR exceptions (if the volume group doesn't
                # exist for example after a bad user action).
                pass
            else:
                raise xs_errors.XenError('SRUnavailable', opterr=str(e))

        if self._linstor:
            try:
                hosts = self._linstor.disconnected_hosts
            except Exception as e:
                raise xs_errors.XenError('SRUnavailable', opterr=str(e))

            # Disconnected hosts are only reported, they do not abort.
            if hosts:
                util.SMlog('Failed to join node(s): {}'.format(hosts))

            try:
                # If the command is a SR command on the master, we must
                # load all VDIs and clean journal transactions.
                # We must load the VDIs in the snapshot case too.
                if self._is_master and self.cmd not in [
                    'vdi_attach', 'vdi_detach',
                    'vdi_activate', 'vdi_deactivate',
                    'vdi_epoch_begin', 'vdi_epoch_end',
                    'vdi_update', 'vdi_destroy'
                ]:
                    self._load_vdis()
                    self._undo_all_journal_transactions()
                    self._linstor.remove_resourceless_volumes()

                self._synchronize_metadata()
            except Exception as e:
                # Failures here are logged with their traceback and do not
                # prevent the wrapped method from running.
                util.SMlog(
                    'Ignoring exception in LinstorSR.load: {}'.format(e)
                )
                util.SMlog(traceback.format_exc())

        return method(self, *args, **kwargs)

    return wrap
@_locked_load
def delete(self, uuid):
    """
    Destroy the LINSTOR SR once a forced GC pass has been run.

    Raises XenError('SRNotEmpty') if VDIs are still loaded for this SR and
    XenError('LinstorSRDelete') if the destruction itself fails.
    """
    sr_uuid = self.uuid
    util.SMlog('LinstorSR.delete for {}'.format(sr_uuid))
    cleanup.gc_force(self.session, sr_uuid)

    if self.vdis:
        raise xs_errors.XenError('SRNotEmpty')

    # TODO: Use specific exceptions. If the LINSTOR group doesn't
    # exist, we can remove it without problem.

    # TODO: Maybe remove all volumes unused by the SMAPI.
    # We must ensure it's a safe idea...
    try:
        self._linstor.destroy()
        Lock.cleanupAll(sr_uuid)
    except Exception as error:
        util.SMlog('Failed to delete LINSTOR SR: {}'.format(error))
        raise xs_errors.XenError('LinstorSRDelete', opterr=str(error))
@_locked_load
def scan(self, uuid):
    """
    Refresh the SR sizes, drop deleted VDIs and run the generic SR scan.

    Returns the value of the parent class `scan`. The GC is kicked only
    after the parent scan has updated the database.

    Raises XenError('SRUnavailable') when no LINSTOR volume manager could
    be opened for this SR.
    """
    util.SMlog('LinstorSR.scan for {}'.format(self.uuid))
    if not self._linstor:
        raise xs_errors.XenError(
            'SRUnavailable',
            opterr='no such volume group: {}'.format(self._group_name)
        )

    self._update_physical_size()

    # Iterate over a snapshot of the keys: entries are removed from
    # `self.vdis` inside the loop, which must not depend on `keys()`
    # returning a copy (it is a live view on Python 3).
    for vdi_uuid in list(self.vdis.keys()):
        if self.vdis[vdi_uuid].deleted:
            del self.vdis[vdi_uuid]

    # Update the database before the restart of the GC to avoid
    # bad sync in the process if new VDIs have been introduced.
    ret = super(LinstorSR, self).scan(self.uuid)
    self._kick_gc()
    return ret
def _shared_lock_vdi(self, vdi_uuid, locked=True):
    """
    Call the `lockVdi` function of the manager plugin on the pool master.

    `locked` is forwarded to the plugin as a string. Raises
    XenError('VDIUnavailable') when the plugin answers 'False'.
    """
    xenapi = self.session.xenapi
    pool_ref = xenapi.pool.get_all()[0]
    master_ref = xenapi.pool.get_master(pool_ref)

    plugin_method = 'lockVdi'
    plugin_args = {
        'groupName': self._group_name,
        'srUuid': self.uuid,
        'vdiUuid': vdi_uuid,
        'locked': str(locked)
    }

    result = xenapi.host.call_plugin(
        master_ref, self.MANAGER_PLUGIN, plugin_method, plugin_args
    )
    util.SMlog(
        'call-plugin ({} with {}) returned: {}'
        .format(plugin_method, plugin_args, result)
    )

    if result != 'False':
        return

    raise xs_errors.XenError(
        'VDIUnavailable',
        opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN)
    )
def _update_stats(self, virt_alloc_delta):
    """
    Refresh the size attributes of the SR and push them to the database.

    The virtual allocation becomes the value currently recorded in XAPI
    plus `virt_alloc_delta`; the physical figures are recomputed by
    `_update_physical_size()` before `_db_update()` is called.
    """
    sr_api = self.session.xenapi.SR
    recorded_allocation = sr_api.get_virtual_allocation(self.sr_ref)

    # Size attribute inherited from the SR parent class.
    self.virtual_allocation = int(recorded_allocation) + virt_alloc_delta

    # Physical size contains the total physical size.
    # i.e. the sum of the sizes of all devices on all hosts, not the AVG.
    self._update_physical_size()

    # Let the SR parent class persist the new values.
    self._db_update()
def _load_vdis(self):
    """
    Populate `self.vdis` from the volumes known by LINSTOR.

    Does nothing when `self.vdis` is already filled. Otherwise:
    - volumes unknown to XAPI are introduced in the XAPI database, but only
      during `sr_scan` and when the journaler has no CLONE entry;
    - a VDI object is created for every remaining volume;
    - CBT and snapshot relations are pushed to XAPI;
    - `self.virtual_allocation` is recomputed and hidden leaves are removed
      from `self.vdis`.
    """
    if self.vdis:
        return

    # 1. Get existing VDIs in XAPI.
    xenapi = self.session.xenapi
    xapi_vdi_uuids = set()
    for vdi in xenapi.SR.get_VDIs(self.sr_ref):
        xapi_vdi_uuids.add(xenapi.VDI.get_uuid(vdi))

    # 2. Get volumes info.
    all_volume_info = self._linstor.volumes_with_info
    volumes_metadata = self._linstor.volumes_with_metadata

    # 3. Get CBT vdis.
    # See: https://support.citrix.com/article/CTX230619
    cbt_vdis = set()
    for metadata in volumes_metadata.values():
        cbt_uuid = metadata.get(CBTLOG_TAG)
        if cbt_uuid:
            cbt_vdis.add(cbt_uuid)

    introduce = False

    if self.cmd == 'sr_scan':
        has_clone_entries = list(self._journaler.get_all(
            LinstorJournaler.CLONE
        ).items())

        if has_clone_entries:
            util.SMlog(
                'Cannot introduce VDIs during scan because it exists '
                'CLONE entries in journaler on SR {}'.format(self.uuid)
            )
        else:
            introduce = True

    # 4. Now check all volume info.
    vdi_to_snaps = {}
    for vdi_uuid, volume_info in all_volume_info.items():
        if vdi_uuid.startswith(cleanup.SR.TMP_RENAME_PREFIX):
            continue

        # Always fetch the metadata of *this* volume. It is needed by the
        # CBT step below for VDIs that are already in XAPI too, so it must
        # not be a leftover of step 3 or of a previous iteration.
        volume_metadata = volumes_metadata.get(vdi_uuid)

        # 4.a. Check if the VDI in LINSTOR is in XAPI VDIs.
        if vdi_uuid not in xapi_vdi_uuids:
            if not introduce:
                continue

            if not volume_metadata:
                util.SMlog(
                    'Skipping volume {} because no metadata could be found'
                    .format(vdi_uuid)
                )
                continue

            util.SMlog(
                'Trying to introduce VDI {} as it is present in '
                'LINSTOR and not in XAPI...'
                .format(vdi_uuid)
            )

            try:
                self._linstor.get_device_path(vdi_uuid)
            except Exception as e:
                util.SMlog(
                    'Cannot introduce {}, unable to get path: {}'
                    .format(vdi_uuid, e)
                )
                continue

            name_label = volume_metadata.get(NAME_LABEL_TAG) or ''
            # XAPI "type" field of the VDI (not the raw/VHD image format).
            xapi_type = volume_metadata.get(TYPE_TAG) or 'user'
            vdi_type = volume_metadata.get(VDI_TYPE_TAG)

            if not vdi_type:
                util.SMlog(
                    'Cannot introduce {} '.format(vdi_uuid) +
                    'without vdi_type'
                )
                continue

            sm_config = {
                'vdi_type': vdi_type
            }

            if vdi_type == vhdutil.VDI_TYPE_RAW:
                managed = not volume_metadata.get(HIDDEN_TAG)
            elif vdi_type == vhdutil.VDI_TYPE_VHD:
                vhd_info = self._vhdutil.get_vhd_info(vdi_uuid)
                managed = not vhd_info.hidden
                if vhd_info.parentUuid:
                    sm_config['vhd-parent'] = vhd_info.parentUuid
            else:
                util.SMlog(
                    'Cannot introduce {} with invalid VDI type {}'
                    .format(vdi_uuid, vdi_type)
                )
                continue

            util.SMlog(
                'Introducing VDI {} '.format(vdi_uuid) +
                ' (name={}, virtual_size={}, physical_size={})'.format(
                    name_label,
                    volume_info.virtual_size,
                    volume_info.physical_size
                )
            )

            vdi_ref = xenapi.VDI.db_introduce(
                vdi_uuid,
                name_label,
                volume_metadata.get(NAME_DESCRIPTION_TAG) or '',
                self.sr_ref,
                xapi_type,
                False,  # sharable
                bool(volume_metadata.get(READ_ONLY_TAG)),
                {},  # other_config
                vdi_uuid,  # location
                {},  # xenstore_data
                sm_config,
                managed,
                str(volume_info.virtual_size),
                str(volume_info.physical_size)
            )

            is_a_snapshot = volume_metadata.get(IS_A_SNAPSHOT_TAG)
            xenapi.VDI.set_is_a_snapshot(vdi_ref, bool(is_a_snapshot))
            if is_a_snapshot:
                # `.get` is used so that a missing tag does not abort the
                # whole load with a KeyError.
                xenapi.VDI.set_snapshot_time(
                    vdi_ref,
                    xmlrpclib.DateTime(
                        volume_metadata.get(SNAPSHOT_TIME_TAG) or
                        '19700101T00:00:00Z'
                    )
                )

                snap_uuid = volume_metadata.get(SNAPSHOT_OF_TAG)
                if snap_uuid:
                    vdi_to_snaps.setdefault(snap_uuid, []).append(vdi_uuid)

        # 4.b. Add the VDI in the list.
        vdi = self.vdi(vdi_uuid)
        self.vdis[vdi_uuid] = vdi

        if vdi.vdi_type == vhdutil.VDI_TYPE_VHD:
            vdi.sm_config_override['key_hash'] = \
                self._vhdutil.get_key_hash(vdi_uuid)

        # 4.c. Update CBT status of disks either just added
        # or already in XAPI.
        cbt_uuid = None
        if volume_metadata:
            cbt_uuid = volume_metadata.get(CBTLOG_TAG)
        if cbt_uuid and cbt_uuid in cbt_vdis:
            vdi_ref = xenapi.VDI.get_by_uuid(vdi_uuid)
            xenapi.VDI.set_cbt_enabled(vdi_ref, True)
            # For existing VDIs, update local state too.
            # Scan in base class SR updates existing VDIs
            # again based on local states.
            self.vdis[vdi_uuid].cbt_enabled = True
            cbt_vdis.remove(cbt_uuid)

    # 5. Now set the snapshot statuses correctly in XAPI.
    for src_uuid, snap_uuids in vdi_to_snaps.items():
        try:
            src_ref = xenapi.VDI.get_by_uuid(src_uuid)
        except Exception:
            # The source VDI no longer exists, continue.
            continue

        for snap_uuid in snap_uuids:
            try:
                # This might fail in cases where its already set.
                snap_ref = xenapi.VDI.get_by_uuid(snap_uuid)
                xenapi.VDI.set_snapshot_of(snap_ref, src_ref)
            except Exception as e:
                util.SMlog('Setting snapshot failed: {}'.format(e))

    # TODO: Check correctly how to use CBT.
    # Update cbt_enabled on the right VDI, check LVM/FileSR code.

    # 6. If we have items remaining in this list,
    # they are cbt_metadata VDI that XAPI doesn't know about.
    # Add them to self.vdis and they'll get added to the DB.
    for cbt_uuid in cbt_vdis:
        new_vdi = self.vdi(cbt_uuid)
        new_vdi.ty = 'cbt_metadata'
        new_vdi.cbt_enabled = True
        self.vdis[cbt_uuid] = new_vdi

    # 7. Update virtual allocation and build the genealogy
    # (parent UUID -> list of child UUIDs).
    self.virtual_allocation = 0
    genealogy = {}

    for vdi_uuid, vdi in self.vdis.items():
        if vdi.parent:
            if vdi.parent in self.vdis:
                self.vdis[vdi.parent].read_only = True
            genealogy.setdefault(vdi.parent, []).append(vdi_uuid)
        if not vdi.hidden:
            self.virtual_allocation += vdi.utilisation

    # 8. Remove all hidden leaf nodes to avoid introducing records that
    # will be GC'ed. A snapshot of the keys is used because entries are
    # deleted while looping.
    for vdi_uuid in list(self.vdis.keys()):
        if vdi_uuid not in genealogy and self.vdis[vdi_uuid].hidden:
            util.SMlog(
                'Scan found hidden leaf ({}), ignoring'.format(vdi_uuid)
            )
            del self.vdis[vdi_uuid]
def _handle_interrupted_inflate(self, vdi_uuid, old_size):
    """
    Roll back an inflate operation found in the journal.

    The last `vhdutil.VHD_FOOTER_SIZE` bytes of the current volume are
    zeroed, then the volume is deflated back to `old_size`. Nothing is
    done (apart from a log) when the VDI is not in `self.vdis`.
    """
    util.SMlog(
        '*** INTERRUPTED INFLATE OP: for {} ({})'
        .format(vdi_uuid, old_size)
    )

    vdi = self.vdis.get(vdi_uuid)
    if not vdi:
        util.SMlog('Cannot deflate missing VDI {}'.format(vdi_uuid))
        return

    # The size must be read from the volume of the interrupted VDI:
    # `self.uuid` is the UUID of the SR here, not the one of a volume.
    current_size = self._linstor.get_volume_info(vdi_uuid).virtual_size
    util.zeroOut(
        vdi.path,
        current_size - vhdutil.VHD_FOOTER_SIZE,
        vhdutil.VHD_FOOTER_SIZE
    )
    deflate(vdi_uuid, vdi.path, old_size, current_size)
def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid):
    """
    Revert an interrupted clone: the base copy becomes `vdi_uuid` again.

    `volume_names` maps volume UUIDs to LINSTOR volume names. The snapshot
    and the current `vdi_uuid` volumes are destroyed when they are listed
    in it, then the base volume is renamed to `vdi_uuid`.

    Returns early, after logging, when the base device path doesn't exist.
    Raises util.SMException if the snapshot volume is not a VHD.
    """
    base_path = self._linstor.build_device_path(volume_names[base_uuid])
    base_metadata = self._linstor.get_volume_metadata(base_uuid)
    base_type = base_metadata[VDI_TYPE_TAG]

    if not util.pathexists(base_path):
        util.SMlog('Base not found! Exit...')
        util.SMlog('*** INTERRUPTED CLONE OP: rollback fail')
        return

    # Un-hide the parent.
    self._linstor.update_volume_metadata(base_uuid, {READ_ONLY_TAG: False})
    if base_type == vhdutil.VDI_TYPE_VHD:
        # For a VHD the hidden flag is read from the VHD info...
        vhd_info = self._vhdutil.get_vhd_info(base_uuid, False)
        if vhd_info.hidden:
            vhdutil.setHidden(base_path, False)
    elif base_type == vhdutil.VDI_TYPE_RAW and \
            base_metadata.get(HIDDEN_TAG):
        # ...whereas for a RAW volume it is read from the volume metadata.
        self._linstor.update_volume_metadata(
            base_uuid, {HIDDEN_TAG: False}
        )

    # Remove the child nodes.
    if snap_uuid and snap_uuid in volume_names:
        util.SMlog('Destroying snap {}...'.format(snap_uuid))
        snap_metadata = self._linstor.get_volume_metadata(snap_uuid)

        if snap_metadata.get(VDI_TYPE_TAG) != vhdutil.VDI_TYPE_VHD:
            raise util.SMException('Clone {} not VHD'.format(snap_uuid))

        try:
            self._linstor.destroy_volume(snap_uuid)
        except Exception as e:
            # Best effort: a snapshot that cannot be destroyed is only
            # logged, the rollback goes on.
            util.SMlog(
                'Cannot destroy snap {} during undo clone: {}'
                .format(snap_uuid, e)
            )

    if vdi_uuid in volume_names:
        try:
            util.SMlog('Destroying {}...'.format(vdi_uuid))
            self._linstor.destroy_volume(vdi_uuid)
        except Exception as e:
            util.SMlog(
                'Cannot destroy VDI {} during undo clone: {}'
                .format(vdi_uuid, e)
            )
            # We can get an exception like this:
            # "Shutdown of the DRBD resource 'XXX failed", so the
            # volume info remains... The problem is we can't rename
            # properly the base VDI below this line, so we must change the
            # UUID of this bad VDI before.
            self._linstor.update_volume_uuid(
                vdi_uuid, 'DELETED_' + vdi_uuid, force=True
            )

    # Rename!
    self._linstor.update_volume_uuid(base_uuid, vdi_uuid)

    # Inflate to the right size.
    if base_type == vhdutil.VDI_TYPE_VHD:
        vdi = self.vdi(vdi_uuid)
        volume_size = compute_volume_size(vdi.size, vdi.vdi_type)
        inflate(
            self._journaler, self._linstor, vdi_uuid, vdi.path,
            volume_size, vdi.capacity
        )
        # The reloaded VDI replaces any previous entry in the SR map.
        self.vdis[vdi_uuid] = vdi

    # At this stage, tapdisk and SM vdi will be in paused state. Remove
    # flag to facilitate vm deactivate.
    vdi_ref = self.session.xenapi.VDI.get_by_uuid(vdi_uuid)
    self.session.xenapi.VDI.remove_from_sm_config(vdi_ref, 'paused')

    util.SMlog('*** INTERRUPTED CLONE OP: rollback success')
def load(self, vdi_uuid):
    """
    Load the state of this VDI from LINSTOR.

    A volume missing in LINSTOR is tolerated for `vdi_delete` (the VDI is
    flagged as deleted) and for `vdi_create` (the requested image type and
    key hash are read from the `vdi_sm_config` parameter). Every other
    failure is logged and reported as XenError('VDIUnavailable').
    """
    self._lock = self.sr.lock
    self._exists = True
    self._linstor = self.sr._linstor

    # Update hidden parent property.
    self.hidden = False

    def fail_load(error):
        # Always raises: log the error with its traceback first.
        util.SMlog(
            'Got exception in LinstorVDI.load: {}'.format(error)
        )
        util.SMlog(traceback.format_exc())
        raise xs_errors.XenError(
            'VDIUnavailable',
            opterr='Could not load {} because: {}'.format(self.uuid, error)
        )

    # Try to load VDI.
    try:
        srcmd = self.sr.srcmd
        from_config = srcmd.cmd in (
            'vdi_attach_from_config', 'vdi_detach_from_config'
        )
        if from_config and srcmd.params['vdi_uuid'] == self.uuid:
            # Type and path are taken as is from the command parameters.
            self.vdi_type = vhdutil.VDI_TYPE_RAW
            self.path = srcmd.params['vdi_path']
        else:
            self._determine_type_and_path()
            self._load_this()

        util.SMlog('VDI {} loaded! (path={}, hidden={})'.format(
            self.uuid, self.path, self.hidden
        ))
    except LinstorVolumeManagerError as e:
        if e.code != LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS:
            fail_load(e)

        command = self.sr.srcmd.cmd

        # 1. It may be a VDI deletion.
        if command == 'vdi_delete':
            self.deleted = True
            return

        # 2. Or maybe a creation.
        if command != 'vdi_create':
            fail_load(e)

        # Set type attribute of VDI parent class.
        # We use VHD by default.
        self.vdi_type = vhdutil.VDI_TYPE_VHD
        self._key_hash = None  # Only used in create.

        self._exists = False
        vdi_sm_config = self.sr.srcmd.params.get('vdi_sm_config')
        if vdi_sm_config is not None:
            requested_type = vdi_sm_config.get('type')
            if requested_type is not None:
                if requested_type == self.TYPE_RAW:
                    self.vdi_type = vhdutil.VDI_TYPE_RAW
                elif requested_type == self.TYPE_VHD:
                    self.vdi_type = vhdutil.VDI_TYPE_VHD
                else:
                    raise xs_errors.XenError(
                        'VDICreate',
                        opterr='Invalid VDI type {}'.format(requested_type)
                    )
            if self.vdi_type == vhdutil.VDI_TYPE_VHD:
                self._key_hash = vdi_sm_config.get('key_hash')

        # For the moment we don't have a path.
        self._update_device_name(None)
        return
    except Exception as e:
        fail_load(e)
def delete(self, sr_uuid, vdi_uuid, data_only=False):
    """
    Destroy the LINSTOR volume of this VDI and forget it in XAPI.

    The XAPI record is kept when `data_only` is true. Returns the value of
    the parent class `delete`.

    Raises XenError('VDIInUse') when the VDI is attached and
    XenError('VDIDelete') when the VDI is not managed or when the volume
    cannot be removed.
    """
    util.SMlog('LinstorVDI.delete for {}'.format(self.uuid))
    if self.attached:
        raise xs_errors.XenError('VDIInUse')

    parent = super(LinstorVDI, self)

    # Volume already missing in LINSTOR: only the generic part remains.
    if self.deleted:
        return parent.delete(sr_uuid, vdi_uuid, data_only)

    vdi_ref = self.sr.srcmd.params['vdi_ref']
    is_managed = self.session.xenapi.VDI.get_managed(vdi_ref)
    if not is_managed:
        raise xs_errors.XenError(
            'VDIDelete',
            opterr='Deleting non-leaf node not permitted'
        )

    try:
        # Remove from XAPI and delete from LINSTOR.
        self._linstor.destroy_volume(self.uuid)
        if not data_only:
            self._db_forget()

        self.sr.lock.cleanupAll(vdi_uuid)
    except Exception as error:
        util.SMlog(
            'Failed to remove the volume (maybe is leaf coalescing) '
            'for {} err: {}'.format(self.uuid, error)
        )
        raise xs_errors.XenError('VDIDelete', opterr=str(error))

    # Drop the VDI from the SR map if it is still referenced there.
    if self.uuid in self.sr.vdis:
        del self.sr.vdis[self.uuid]

    # TODO: Check size after delete.
    self.sr._update_stats(-self.capacity)
    self.sr._kick_gc()
    return parent.delete(sr_uuid, vdi_uuid, data_only)
def detach(self, sr_uuid, vdi_uuid):
    """
    Detach the VDI and deflate its volume when that is useful.

    RAW volumes are never deflated. A VHD volume is deflated when it is a
    snapshot, or when it is not already deflated and the SR provisioning
    is not 'thick'.

    Raises XenError('VDIUnavailable') if the deflate step fails.
    """
    util.SMlog('LinstorVDI.detach for {}'.format(self.uuid))
    self.attached = False

    if self.vdi_type == vhdutil.VDI_TYPE_RAW:
        return

    # The VDI is already deflated if the VHD image size + metadata is
    # equal to the LINSTOR volume size.
    volume_size = compute_volume_size(self.size, self.vdi_type)
    already_deflated = self.capacity <= volume_size

    if already_deflated:
        util.SMlog(
            'VDI {} already deflated (old volume size={}, volume size={})'
            .format(self.uuid, self.capacity, volume_size)
        )

    deflate_wanted = (
        not already_deflated and self.sr._provisioning != 'thick'
    )

    # A snapshot is deflated whatever the previous checks concluded.
    vdi_ref = self.sr.srcmd.params['vdi_ref']
    if self.session.xenapi.VDI.get_is_a_snapshot(vdi_ref):
        deflate_wanted = True

    if not deflate_wanted:
        return

    try:
        self._prepare_thin(False)
    except Exception as error:
        raise xs_errors.XenError(
            'VDIUnavailable',
            opterr='Failed to detach VDI during "prepare thin": {}'
            .format(error)
        )
+ new_volume_size = old_volume_size + assert new_volume_size >= old_volume_size + + space_needed = new_volume_size - old_volume_size + self.sr._ensure_space_available(space_needed) + + old_capacity = self.capacity + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + self._linstor.resize(self.uuid, new_volume_size) + else: + if new_volume_size != old_volume_size: + inflate( + self.sr._journaler, self._linstor, self.uuid, self.path, + new_volume_size, old_volume_size + ) + vhdutil.setSizeVirtFast(self.path, size) + + # Reload size attributes. + self._load_this() + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + self.session.xenapi.VDI.set_virtual_size(vdi_ref, str(self.size)) + self.session.xenapi.VDI.set_physical_utilisation( + vdi_ref, str(self.utilisation) + ) + self.sr._update_stats(self.capacity - old_capacity) + return VDI.VDI.get_params(self) + + def clone(self, sr_uuid, vdi_uuid): + return self._do_snapshot(sr_uuid, vdi_uuid, VDI.SNAPSHOT_DOUBLE) + + def compose(self, sr_uuid, vdi1, vdi2): + util.SMlog('VDI.compose for {} -> {}'.format(vdi2, vdi1)) + if self.vdi_type != vhdutil.VDI_TYPE_VHD: + raise xs_errors.XenError('Unimplemented') + + parent_uuid = vdi1 + parent_path = self._linstor.get_device_path(parent_uuid) + + # We must pause tapdisk to correctly change the parent. Otherwise we + # have a readonly error. 
+ # See: https://github.com/xapi-project/xen-api/blob/b3169a16d36dae0654881b336801910811a399d9/ocaml/xapi/storage_migrate.ml#L928-L929 + # and: https://github.com/xapi-project/xen-api/blob/b3169a16d36dae0654881b336801910811a399d9/ocaml/xapi/storage_migrate.ml#L775 + + if not blktap2.VDI.tap_pause(self.session, self.sr.uuid, self.uuid): + raise util.SMException('Failed to pause VDI {}'.format(self.uuid)) + try: + vhdutil.setParent(self.path, parent_path, False) + vhdutil.setHidden(parent_path) + self.sr.session.xenapi.VDI.set_managed( + self.sr.srcmd.params['args'][0], False + ) + finally: + blktap2.VDI.tap_unpause(self.session, self.sr.uuid, self.uuid) + + if not blktap2.VDI.tap_refresh(self.session, self.sr.uuid, self.uuid): + raise util.SMException( + 'Failed to refresh VDI {}'.format(self.uuid) + ) + + util.SMlog('Compose done') + + def generate_config(self, sr_uuid, vdi_uuid): + """ + Generate the XML config required to attach and activate + a VDI for use when XAPI is not running. Attach and + activation is handled by vdi_attach_from_config below. + """ + + util.SMlog('LinstorVDI.generate_config for {}'.format(self.uuid)) + + if not self.path or not util.pathexists(self.path): + available = False + # Try to refresh symlink path... + try: + self.path = self._linstor.get_device_path(vdi_uuid) + available = util.pathexists(self.path) + except Exception: + pass + if not available: + raise xs_errors.XenError('VDIUnavailable') + + resp = {} + resp['device_config'] = self.sr.dconf + resp['sr_uuid'] = sr_uuid + resp['vdi_uuid'] = self.uuid + resp['sr_sm_config'] = self.sr.sm_config + resp['vdi_path'] = self.path + resp['command'] = 'vdi_attach_from_config' + + config = xmlrpclib.dumps(tuple([resp]), 'vdi_attach_from_config') + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + """ + Attach and activate a VDI using config generated by + vdi_generate_config above. 
This is used for cases such as + the HA state-file and the redo-log. + """ + + util.SMlog('LinstorVDI.attach_from_config for {}'.format(vdi_uuid)) + + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + + if not DRIVER_CONFIG['ATTACH_FROM_CONFIG_WITH_TAPDISK']: + return self.attach(sr_uuid, vdi_uuid) + except Exception: + util.logException('LinstorVDI.attach_from_config') + raise xs_errors.XenError( + 'SRUnavailable', + opterr='Unable to attach from config' + ) + + def reset_leaf(self, sr_uuid, vdi_uuid): + if self.vdi_type != vhdutil.VDI_TYPE_VHD: + raise xs_errors.XenError('Unimplemented') + + if not self.sr._vhdutil.has_parent(self.uuid): + raise util.SMException( + 'ERROR: VDI {} has no parent, will not reset contents' + .format(self.uuid) + ) + + vhdutil.killData(self.path) + + def _load_this(self): + volume_metadata = self._linstor.get_volume_metadata(self.uuid) + volume_info = self._linstor.get_volume_info(self.uuid) + + # Contains the physical size used on all disks. + # When LINSTOR LVM driver is used, the size should be similar to + # virtual size (i.e. the LINSTOR max volume size). + # When LINSTOR Thin LVM driver is used, the used physical size should + # be lower than virtual size at creation. + # The physical size increases after each write in a new block. + self.utilisation = volume_info.physical_size + self.capacity = volume_info.virtual_size + + if self.vdi_type == vhdutil.VDI_TYPE_RAW: + self.hidden = int(volume_metadata.get(HIDDEN_TAG) or 0) + self.size = volume_info.virtual_size + self.parent = '' + else: + vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) + self.hidden = vhd_info.hidden + self.size = vhd_info.sizeVirt + self.parent = vhd_info.parentUuid + + if self.hidden: + self.managed = False + + self.label = volume_metadata.get(NAME_LABEL_TAG) or '' + self.description = volume_metadata.get(NAME_DESCRIPTION_TAG) or '' + + # Update sm_config_override of VDI parent class. 
+ self.sm_config_override = {'vhd-parent': self.parent or None} + + def _mark_hidden(self, hidden=True): + if self.hidden == hidden: + return + + if self.vdi_type == vhdutil.VDI_TYPE_VHD: + vhdutil.setHidden(self.path, hidden) + else: + self._linstor.update_volume_metadata(self.uuid, { + HIDDEN_TAG: hidden + }) + self.hidden = hidden + + def update(self, sr_uuid, vdi_uuid): + xenapi = self.session.xenapi + vdi_ref = xenapi.VDI.get_by_uuid(self.uuid) + + volume_metadata = { + NAME_LABEL_TAG: util.to_plain_string( + xenapi.VDI.get_name_label(vdi_ref) + ), + NAME_DESCRIPTION_TAG: util.to_plain_string( + xenapi.VDI.get_name_description(vdi_ref) + ) + } + + try: + self._linstor.update_volume_metadata(self.uuid, volume_metadata) + except LinstorVolumeManagerError as e: + if e.code == LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='LINSTOR volume {} not found'.format(self.uuid) + ) + raise xs_errors.XenError('VDIUnavailable', opterr=str(e)) + + # -------------------------------------------------------------------------- + # Thin provisioning. + # -------------------------------------------------------------------------- + + def _prepare_thin(self, attach): + if self.sr._is_master: + if attach: + attach_thin( + self.session, self.sr._journaler, self._linstor, + self.sr.uuid, self.uuid + ) + else: + detach_thin( + self.session, self._linstor, self.sr.uuid, self.uuid + ) + else: + fn = 'attach' if attach else 'detach' + + # We assume the first pool is always the one currently in use. 
+ pools = self.session.xenapi.pool.get_all() + master = self.session.xenapi.pool.get_master(pools[0]) + args = { + 'groupName': self.sr._group_name, + 'srUuid': self.sr.uuid, + 'vdiUuid': self.uuid + } + ret = self.session.xenapi.host.call_plugin( + master, self.sr.MANAGER_PLUGIN, fn, args + ) + util.SMlog( + 'call-plugin ({} with {}) returned: {}'.format(fn, args, ret) + ) + if ret == 'False': + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Plugin {} failed'.format(self.sr.MANAGER_PLUGIN) + ) + + # Reload size attrs after inflate or deflate! + self._load_this() + self.sr._update_physical_size() + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + self.session.xenapi.VDI.set_physical_utilisation( + vdi_ref, str(self.utilisation) + ) + + self.session.xenapi.SR.set_physical_utilisation( + self.sr.sr_ref, str(self.sr.physical_utilisation) + ) + + # -------------------------------------------------------------------------- + # Generic helpers. + # -------------------------------------------------------------------------- + + def _determine_type_and_path(self): + """ + Determine whether this is a RAW or a VHD VDI. + """ + + # 1. Check vdi_ref and vdi_type in config. + try: + vdi_ref = self.session.xenapi.VDI.get_by_uuid(self.uuid) + if vdi_ref: + sm_config = self.session.xenapi.VDI.get_sm_config(vdi_ref) + vdi_type = sm_config.get('vdi_type') + if vdi_type: + # Update parent fields. + self.vdi_type = vdi_type + self.sm_config_override = sm_config + self._update_device_name( + self._linstor.get_volume_name(self.uuid) + ) + return + except Exception: + pass + + # 2. Otherwise use the LINSTOR volume manager directly. + # It's probably a new VDI created via snapshot. 
+ volume_metadata = self._linstor.get_volume_metadata(self.uuid) + self.vdi_type = volume_metadata.get(VDI_TYPE_TAG) + if not self.vdi_type: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='failed to get vdi_type in metadata' + ) + self._update_device_name( + self._linstor.get_volume_name(self.uuid) + ) + + def _update_device_name(self, device_name): + self._device_name = device_name + + # Mark path of VDI parent class. + if device_name: + self.path = self._linstor.build_device_path(self._device_name) + else: + self.path = None + + def _create_snapshot(self, snap_uuid, snap_of_uuid=None): + """ + Snapshot self and return the snapshot VDI object. + """ + + # 1. Create a new LINSTOR volume with the same size than self. + snap_path = self._linstor.shallow_clone_volume( + self.uuid, snap_uuid, persistent=False + ) + + # 2. Write the snapshot content. + is_raw = (self.vdi_type == vhdutil.VDI_TYPE_RAW) + vhdutil.snapshot( + snap_path, self.path, is_raw, self.MAX_METADATA_VIRT_SIZE + ) + + # 3. Get snapshot parent. + snap_parent = self.sr._vhdutil.get_parent(snap_uuid) + + # 4. Update metadata. + util.SMlog('Set VDI {} metadata of snapshot'.format(snap_uuid)) + volume_metadata = { + NAME_LABEL_TAG: util.to_plain_string(self.label), + NAME_DESCRIPTION_TAG: util.to_plain_string(self.description), + IS_A_SNAPSHOT_TAG: bool(snap_of_uuid), + SNAPSHOT_OF_TAG: snap_of_uuid, + SNAPSHOT_TIME_TAG: '', + TYPE_TAG: self.ty, + VDI_TYPE_TAG: vhdutil.VDI_TYPE_VHD, + READ_ONLY_TAG: False, + METADATA_OF_POOL_TAG: '' + } + self._linstor.set_volume_metadata(snap_uuid, volume_metadata) + + # 5. Set size. + snap_vdi = LinstorVDI(self.sr, snap_uuid) + if not snap_vdi._exists: + raise xs_errors.XenError('VDISnapshot') + + volume_info = self._linstor.get_volume_info(snap_uuid) + + snap_vdi.size = self.sr._vhdutil.get_size_virt(snap_uuid) + snap_vdi.utilisation = volume_info.physical_size + + # 6. Update sm config. 
+ snap_vdi.sm_config = {} + snap_vdi.sm_config['vdi_type'] = snap_vdi.vdi_type + if snap_parent: + snap_vdi.sm_config['vhd-parent'] = snap_parent + snap_vdi.parent = snap_parent + + snap_vdi.label = self.label + snap_vdi.description = self.description + + self._linstor.mark_volume_as_persistent(snap_uuid) + + return snap_vdi + + # -------------------------------------------------------------------------- + # Implement specific SR methods. + # -------------------------------------------------------------------------- + + def _rename(self, oldpath, newpath): + # TODO: I'm not sure... Used by CBT. + volume_uuid = self._linstor.get_volume_uuid_from_device_path(oldpath) + self._linstor.update_volume_name(volume_uuid, newpath) + + def _do_snapshot( + self, sr_uuid, vdi_uuid, snap_type, secondary=None, cbtlog=None + ): + # If cbt enabled, save file consistency state. + if cbtlog is not None: + if blktap2.VDI.tap_status(self.session, vdi_uuid): + consistency_state = False + else: + consistency_state = True + util.SMlog( + 'Saving log consistency state of {} for vdi: {}' + .format(consistency_state, vdi_uuid) + ) + else: + consistency_state = None + + if self.vdi_type != vhdutil.VDI_TYPE_VHD: + raise xs_errors.XenError('Unimplemented') + + if not blktap2.VDI.tap_pause(self.session, sr_uuid, vdi_uuid): + raise util.SMException('Failed to pause VDI {}'.format(vdi_uuid)) + try: + return self._snapshot(snap_type, cbtlog, consistency_state) + finally: + blktap2.VDI.tap_unpause(self.session, sr_uuid, vdi_uuid, secondary) + + def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): + util.SMlog( + 'LinstorVDI._snapshot for {} (type {})' + .format(self.uuid, snap_type) + ) + + # 1. Checks... 
+ if self.hidden: + raise xs_errors.XenError('VDIClone', opterr='hidden VDI') + + depth = self.sr._vhdutil.get_depth(self.uuid) + if depth == -1: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='failed to get VHD depth' + ) + elif depth >= vhdutil.MAX_CHAIN_SIZE: + raise xs_errors.XenError('SnapshotChainTooLong') + + volume_path = self.path + if not util.pathexists(volume_path): + raise xs_errors.XenError( + 'EIO', + opterr='IO error checking path {}'.format(volume_path) + ) + + # 2. Create base and snap uuid (if required) and a journal entry. + base_uuid = util.gen_uuid() + snap_uuid = None + + if snap_type == VDI.SNAPSHOT_DOUBLE: + snap_uuid = util.gen_uuid() + + clone_info = '{}_{}'.format(base_uuid, snap_uuid) + + active_uuid = self.uuid + self.sr._journaler.create( + LinstorJournaler.CLONE, active_uuid, clone_info + ) + + try: + # 3. Self becomes the new base. + # The device path remains the same. + self._linstor.update_volume_uuid(self.uuid, base_uuid) + self.uuid = base_uuid + self.location = self.uuid + self.read_only = True + self.managed = False + + # 4. Create snapshots (new active and snap). + active_vdi = self._create_snapshot(active_uuid) + + snap_vdi = None + if snap_type == VDI.SNAPSHOT_DOUBLE: + snap_vdi = self._create_snapshot(snap_uuid, active_uuid) + + self.label = 'base copy' + self.description = '' + + # 5. Mark the base VDI as hidden so that it does not show up + # in subsequent scans. + self._mark_hidden() + self._linstor.update_volume_metadata( + self.uuid, {READ_ONLY_TAG: True} + ) + + # 6. We must update the new active VDI with the "paused" and + # "host_" properties. Why? Because the original VDI has been + # paused and we we must unpause it after the snapshot. + # See: `tap_unpause` in `blktap2.py`. 
+ vdi_ref = self.session.xenapi.VDI.get_by_uuid(active_uuid) + sm_config = self.session.xenapi.VDI.get_sm_config(vdi_ref) + for key in filter( + lambda x: x == 'paused' or x.startswith('host_'), + sm_config.keys() + ): + active_vdi.sm_config[key] = sm_config[key] + + # 7. Verify parent locator field of both children and + # delete base if unused. + introduce_parent = True + try: + snap_parent = None + if snap_vdi: + snap_parent = snap_vdi.parent + + if active_vdi.parent != self.uuid and ( + snap_type == VDI.SNAPSHOT_SINGLE or + snap_type == VDI.SNAPSHOT_INTERNAL or + snap_parent != self.uuid + ): + util.SMlog( + 'Destroy unused base volume: {} (path={})' + .format(self.uuid, self.path) + ) + introduce_parent = False + self._linstor.destroy_volume(self.uuid) + except Exception as e: + util.SMlog('Ignoring exception: {}'.format(e)) + pass + + # 8. Introduce the new VDI records. + if snap_vdi: + # If the parent is encrypted set the key_hash for the + # new snapshot disk. + vdi_ref = self.sr.srcmd.params['vdi_ref'] + sm_config = self.session.xenapi.VDI.get_sm_config(vdi_ref) + # TODO: Maybe remove key_hash support. + if 'key_hash' in sm_config: + snap_vdi.sm_config['key_hash'] = sm_config['key_hash'] + # If we have CBT enabled on the VDI, + # set CBT status for the new snapshot disk. + if cbtlog: + snap_vdi.cbt_enabled = True + + if snap_vdi: + snap_vdi_ref = snap_vdi._db_introduce() + util.SMlog( + 'vdi_clone: introduced VDI: {} ({})' + .format(snap_vdi_ref, snap_vdi.uuid) + ) + if introduce_parent: + base_vdi_ref = self._db_introduce() + self.session.xenapi.VDI.set_managed(base_vdi_ref, False) + util.SMlog( + 'vdi_clone: introduced VDI: {} ({})' + .format(base_vdi_ref, self.uuid) + ) + self._linstor.update_volume_metadata(self.uuid, { + NAME_LABEL_TAG: util.to_plain_string(self.label), + NAME_DESCRIPTION_TAG: util.to_plain_string( + self.description + ), + READ_ONLY_TAG: True, + METADATA_OF_POOL_TAG: '' + }) + + # 9. 
Update cbt files if user created snapshot (SNAPSHOT_DOUBLE) + if snap_type == VDI.SNAPSHOT_DOUBLE and cbtlog: + try: + self._cbt_snapshot(snap_uuid, cbt_consistency) + except Exception: + # CBT operation failed. + # TODO: Implement me. + raise + + if snap_type != VDI.SNAPSHOT_INTERNAL: + self.sr._update_stats(self.capacity) + + # 10. Return info on the new user-visible leaf VDI. + ret_vdi = snap_vdi + if not ret_vdi: + ret_vdi = self + if not ret_vdi: + ret_vdi = active_vdi + + vdi_ref = self.sr.srcmd.params['vdi_ref'] + self.session.xenapi.VDI.set_sm_config( + vdi_ref, active_vdi.sm_config + ) + except Exception as e: + util.logException('Failed to snapshot!') + try: + self.sr._handle_interrupted_clone( + active_uuid, clone_info, force_undo=True + ) + self.sr._journaler.remove(LinstorJournaler.CLONE, active_uuid) + except Exception as e: + util.SMlog( + 'WARNING: Failed to clean up failed snapshot: {}' + .format(e) + ) + raise xs_errors.XenError('VDIClone', opterr=str(e)) + + self.sr._journaler.remove(LinstorJournaler.CLONE, active_uuid) + + return ret_vdi.get_params() + +# ------------------------------------------------------------------------------ + + +if __name__ == '__main__': + SRCommand.run(LinstorSR, DRIVER_INFO) +else: + SR.registerSR(LinstorSR) diff --git a/drivers/XE_SR_ERRORCODES.xml b/drivers/XE_SR_ERRORCODES.xml index 1f58adca..fa87109a 100755 --- a/drivers/XE_SR_ERRORCODES.xml +++ b/drivers/XE_SR_ERRORCODES.xml @@ -898,4 +898,40 @@ ZFS SR deletion error 5001 + + + LinstorMaster + Linstor request must come from master + 5002 + + + + LinstorConfigHostsMissing + The request is missing the LINSTOR hosts parameter + 5003 + + + + LinstorConfigGroupNameMissing + The request is missing the LINSTOR group name parameter + 5004 + + + + LinstorConfigRedundancyMissing + The request is missing the LINSTOR redundancy parameter + 5005 + + + + LinstorSRCreate + LINSTOR SR creation error + 5006 + + + + LinstorSRDelete + LINSTOR SR delete error + 5007 + diff --git 
a/drivers/cleanup.py b/drivers/cleanup.py index 867c7896..a60c9877 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -47,7 +47,17 @@ from refcounter import RefCounter from ipc import IPCFlag from lvmanager import LVActivator -from srmetadata import LVMMetadataHandler +from srmetadata import LVMMetadataHandler, VDI_TYPE_TAG + +try: + from linstorjournaler import LinstorJournaler + from linstorvhdutil import LinstorVhdUtil + from linstorvolumemanager \ + import LinstorVolumeManager, LinstorVolumeManagerError + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + # Disable automatic leaf-coalescing. Online leaf-coalesce is currently not # possible due to lvhd_stop_using_() not working correctly. However, we leave @@ -643,7 +653,19 @@ def getAllPrunable(self): if child not in childList: thisPrunable = False - if not self.scanError and thisPrunable: + # We can destroy the current VDI if all childs are hidden BUT the + # current VDI must be hidden too to do that! + # Example in this case (after a failed live leaf coalesce): + # + # SMGC: [32436] SR 07ed ('linstor-nvme-sr') (2 VDIs in 1 VHD trees): + # SMGC: [32436] b5458d61(1.000G/4.127M) + # SMGC: [32436] *OLD_b545(1.000G/4.129M) + # + # OLD_b545 is hidden and must be removed, but b5458d61 not. + # Normally we are not in this function when the delete action is + # executed but in `_liveLeafCoalesce`. + + if not self.scanError and not self.hidden and thisPrunable: vdiList.append(self) return vdiList @@ -1347,6 +1369,111 @@ def _calcExtraSpaceForSnapshotCoalescing(self): lvhdutil.calcSizeLV(self.getSizeVHD()) +class LinstorVDI(VDI): + """Object representing a VDI in a LINSTOR SR""" + + MAX_SIZE = 2 * 1024 * 1024 * 1024 * 1024 # Max VHD size. 
+
+    VOLUME_LOCK_TIMEOUT = 30
+
+    def load(self, info=None):
+        """
+        Load the VHD attributes of this VDI.
+
+        `info` is an optional, already fetched VHD info object. When it is
+        not given, it is read through the SR's `_vhdutil` helper; if that
+        read raises util.SMException, the method returns with `scanError`
+        left to True. Raises util.SMException if the device path is missing.
+        """
+        # `info` defaults to None: only dereference it when it was supplied,
+        # otherwise the fallback lookup below could never be reached.
+        if info:
+            self.parentUuid = info.parentUuid
+        self.scanError = True
+        self.parent = None
+        self.children = []
+
+        self.fileName = self.sr._linstor.get_volume_name(self.uuid)
+        self.path = self.sr._linstor.build_device_path(self.fileName)
+        if not util.pathexists(self.path):
+            raise util.SMException(
+                '{} of {} not found'
+                .format(self.fileName, self.uuid)
+            )
+
+        if not info:
+            try:
+                info = self.sr._vhdutil.get_vhd_info(self.uuid)
+            except util.SMException:
+                Util.log(
+                    ' [VDI {}: failed to read VHD metadata]'.format(self.uuid)
+                )
+                return
+
+        self.parentUuid = info.parentUuid
+        self.sizeVirt = info.sizeVirt
+        self._sizeVHD = info.sizePhys
+        self.hidden = info.hidden
+        self.scanError = False
+
+    def rename(self, uuid):
+        """
+        Rename the LINSTOR volume to `uuid`, then update this VDI object.
+        """
+        Util.log('Renaming {} -> {} (path={})'.format(
+            self.uuid, uuid, self.path
+        ))
+        self.sr._linstor.update_volume_uuid(self.uuid, uuid)
+        VDI.rename(self, uuid)
+
+    def delete(self):
+        """
+        Destroy the LINSTOR volume and forget the VDI, under the SR lock.
+        Raises util.SMException if this VDI still has children.
+        """
+        if len(self.children) > 0:
+            raise util.SMException(
+                'VDI {} has children, can\'t delete'.format(self.uuid)
+            )
+        self.sr.lock()
+        try:
+            self.sr._linstor.destroy_volume(self.uuid)
+            self.sr.forgetVDI(self.uuid)
+        finally:
+            self.sr.unlock()
+        VDI.delete(self)
+
+    def pauseVDIs(self, vdiList):
+        """
+        Wait until no volume of `vdiList` is locked, then delegate to the
+        base implementation.
+        """
+        self.sr._linstor.ensure_volume_list_is_not_locked(
+            vdiList, timeout=self.VOLUME_LOCK_TIMEOUT
+        )
+        # `super(VDI)` is an unbound super object and cannot dispatch to the
+        # base implementation; call it explicitly like rename()/delete() do.
+        # NOTE(review): assumes the VDI base class defines pauseVDIs - confirm.
+        return VDI.pauseVDIs(self, vdiList)
+
+    def _liveLeafCoalesce(self, vdi):
+        """
+        Wait until the volume of `vdi` is not locked, then delegate to the
+        base implementation.
+        """
+        self.sr._linstor.ensure_volume_is_not_locked(
+            vdi.uuid, timeout=self.VOLUME_LOCK_TIMEOUT
+        )
+        # Same explicit base call as in pauseVDIs (see the reason there).
+        # NOTE(review): assumes the VDI base class defines _liveLeafCoalesce.
+        return VDI._liveLeafCoalesce(self, vdi)
+
+    def _relinkSkip(self):
+        abortFlag = IPCFlag(self.sr.uuid)
+        for child in self.children:
+            if abortFlag.test(FLAG_TYPE_ABORT):
+                raise AbortException('Aborting due to signal')
+            Util.log(
+                ' Relinking {} from {} to {}'.format(
+                    child, self, self.parent
+                )
+            )
+
+            session = child.sr.xapi.session
+            sr_uuid = child.sr.uuid
+            vdi_uuid = child.uuid
+            try:
+                self.sr._linstor.ensure_volume_is_not_locked( +
vdi_uuid, timeout=self.VOLUME_LOCK_TIMEOUT + ) + blktap2.VDI.tap_pause(session, sr_uuid, vdi_uuid) + child._setParent(self.parent) + finally: + blktap2.VDI.tap_unpause(session, sr_uuid, vdi_uuid) + self.children = [] + + def _setHidden(self, hidden=True): + HIDDEN_TAG = 'hidden' + + if self.raw: + self.sr._linstor.update_volume_metadata(self.uuid, { + HIDDEN_TAG: hidden + }) + self.hidden = hidden + else: + VDI._setHidden(self, hidden) + + def _queryVHDBlocks(self): + return self.sr._vhdutil.get_block_bitmap(self.uuid) ################################################################################ # @@ -1403,7 +1530,8 @@ def _getTreeStr(self, vdi, indent = 8): TYPE_FILE = "file" TYPE_LVHD = "lvhd" - TYPES = [TYPE_LVHD, TYPE_FILE] + TYPE_LINSTOR = "linstor" + TYPES = [TYPE_LVHD, TYPE_FILE, TYPE_LINSTOR] LOCK_RETRY_INTERVAL = 3 LOCK_RETRY_ATTEMPTS = 20 @@ -1424,6 +1552,8 @@ def getInstance(uuid, xapiSession, createLock = True, force = False): return FileSR(uuid, xapi, createLock, force) elif type == SR.TYPE_LVHD: return LVHDSR(uuid, xapi, createLock, force) + elif type == SR.TYPE_LINSTOR: + return LinstorSR(uuid, xapi, createLock, force) raise util.SMException("SR type %s not recognized" % type) getInstance = staticmethod(getInstance) @@ -2730,6 +2860,232 @@ def _updateSlavesOnResize(self, vdi): vdi.fileName, vdi.uuid, slaves) +class LinstorSR(SR): + TYPE = SR.TYPE_LINSTOR + + def __init__(self, uuid, xapi, createLock, force): + if not LINSTOR_AVAILABLE: + raise util.SMException( + 'Can\'t load cleanup LinstorSR: LINSTOR libraries are missing' + ) + + SR.__init__(self, uuid, xapi, createLock, force) + self._master_uri = 'linstor://localhost' + self.path = LinstorVolumeManager.DEV_ROOT_PATH + self._reloadLinstor() + + def deleteVDI(self, vdi): + self._checkSlaves(vdi) + SR.deleteVDI(self, vdi) + + def getFreeSpace(self): + return self._linstor.max_volume_size_allowed + + def scan(self, force=False): + all_vdi_info = self._scan(force) + for uuid, vdiInfo in 
all_vdi_info.iteritems():
+            # When vdiInfo is None, the VDI is RAW.
+            vdi = self.getVDI(uuid)
+            if not vdi:
+                self.logFilter.logNewVDI(uuid)
+                vdi = LinstorVDI(self, uuid, not vdiInfo)
+                self.vdis[uuid] = vdi
+            if vdiInfo:
+                vdi.load(vdiInfo)
+        self._removeStaleVDIs(all_vdi_info.keys())
+        self._buildTree(force)
+        self.logFilter.logState()
+        self._handleInterruptedCoalesceLeaf()
+
+    def _reloadLinstor(self):
+        """
+        (Re)create the journaler, the volume manager and the VHD helper
+        from the `group-name` of this host's PBD device config.
+        Raises util.SMException if the PBD cannot be found.
+        """
+        session = self.xapi.session
+        host_ref = util.get_this_host_ref(session)
+        sr_ref = session.xenapi.SR.get_by_uuid(self.uuid)
+
+        pbd = util.find_my_pbd(session, host_ref, sr_ref)
+        if pbd is None:
+            raise util.SMException('Failed to find PBD')
+
+        dconf = session.xenapi.PBD.get_device_config(pbd)
+        group_name = dconf['group-name']
+
+        self.journaler = LinstorJournaler(
+            self._master_uri, group_name, logger=util.SMlog
+        )
+
+        self._linstor = LinstorVolumeManager(
+            self._master_uri,
+            group_name,
+            repair=True,
+            logger=util.SMlog
+        )
+        self._vhdutil = LinstorVhdUtil(session, self._linstor)
+
+    def _scan(self, force):
+        """
+        Load the info of all VDIs, retrying up to SR.SCAN_RETRY_ATTEMPTS
+        times while an entry is flagged in error or the load raises.
+
+        Returns a dict {vdi uuid: VHD info or None}. With `force`, the last
+        loaded result is returned even if it still contains errors.
+        Raises util.SMException('Scan error') otherwise.
+        """
+        # Last loaded result; stays None if every attempt raised, so that
+        # `force` cannot hit an unbound local below.
+        all_vdi_info = None
+        for i in range(SR.SCAN_RETRY_ATTEMPTS):
+            self._reloadLinstor()
+            error = False
+            try:
+                all_vdi_info = self._load_vdi_info()
+                for uuid, vdiInfo in all_vdi_info.iteritems():
+                    if vdiInfo and vdiInfo.error:
+                        error = True
+                        break
+                if not error:
+                    return all_vdi_info
+                Util.log('Scan error, retrying ({})'.format(i))
+            except Exception as e:
+                Util.log('Scan exception, retrying ({}): {}'.format(i, e))
+                Util.log(traceback.format_exc())
+
+        if force and all_vdi_info is not None:
+            return all_vdi_info
+        raise util.SMException('Scan error')
+
+    def _load_vdi_info(self):
+        """
+        Build a dict {vdi uuid: info} from the LINSTOR volumes: a VHD info
+        object for VHD volumes, None for the other types, and a VHDInfo
+        with `error` set when the info of a volume cannot be loaded.
+        """
+        all_vdi_info = {}
+
+        # TODO: Ensure metadata contains the right info.
+
+        all_volume_info = self._linstor.volumes_with_info
+        volumes_metadata = self._linstor.volumes_with_metadata
+        for vdi_uuid, volume_info in all_volume_info.items():
+            try:
+                if not volume_info.name and \
+                        not list(volumes_metadata[vdi_uuid].items()):
+                    continue  # Ignore it, probably deleted.
+
+                vdi_type = volumes_metadata[vdi_uuid][VDI_TYPE_TAG]
+                if vdi_type == vhdutil.VDI_TYPE_VHD:
+                    info = self._vhdutil.get_vhd_info(vdi_uuid)
+                else:
+                    info = None
+            except Exception as e:
+                # Report the failing VDI, not the SR uuid (`self.uuid`).
+                Util.log(
+                    ' [VDI {}: failed to load VDI info]: {}'
+                    .format(vdi_uuid, e)
+                )
+                info = vhdutil.VHDInfo(vdi_uuid)
+                info.error = 1
+            all_vdi_info[vdi_uuid] = info
+        return all_vdi_info
+
+    # TODO: Maybe implement _liveLeafCoalesce/_prepareCoalesceLeaf/
+    # _finishCoalesceLeaf/_updateSlavesOnResize like LVM plugin.
+
+    def _calcExtraSpaceNeeded(self, child, parent):
+        """
+        Return the difference between the rounded-up volume size computed
+        from `parent.sizeVirt` plus the VHD overheads and the current
+        LINSTOR volume size of `parent`. `child` is not used.
+        """
+        meta_overhead = vhdutil.calcOverheadEmpty(LinstorVDI.MAX_SIZE)
+        bitmap_overhead = vhdutil.calcOverheadBitmap(parent.sizeVirt)
+        virtual_size = LinstorVolumeManager.round_up_volume_size(
+            parent.sizeVirt + meta_overhead + bitmap_overhead
+        )
+        # TODO: Check result.
+        return virtual_size - self._linstor.get_volume_size(parent.uuid)
+
+    def _hasValidDevicePath(self, uuid):
+        try:
+            self._linstor.get_device_path(uuid)
+        except Exception:
+            # TODO: Maybe log exception.
+ return False + return True + + def _handleInterruptedCoalesceLeaf(self): + entries = self.journaler.get_all(VDI.JRN_LEAF) + for uuid, parentUuid in entries.iteritems(): + if self._hasValidDevicePath(parentUuid) or \ + self._hasValidDevicePath(self.TMP_RENAME_PREFIX + uuid): + self._undoInterruptedCoalesceLeaf(uuid, parentUuid) + else: + self._finishInterruptedCoalesceLeaf(uuid, parentUuid) + self.journaler.remove(VDI.JRN_LEAF, uuid) + vdi = self.getVDI(uuid) + if vdi: + vdi.ensureUnpaused() + + def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): + Util.log('*** UNDO LEAF-COALESCE') + parent = self.getVDI(parentUuid) + if not parent: + parent = self.getVDI(childUuid) + if not parent: + raise util.SMException( + 'Neither {} nor {} found'.format(parentUuid, childUuid) + ) + Util.log( + 'Renaming parent back: {} -> {}'.format(childUuid, parentUuid) + ) + parent.rename(parentUuid) + util.fistpoint.activate('LVHDRT_coaleaf_undo_after_rename', self.uuid) + + child = self.getVDI(childUuid) + if not child: + child = self.getVDI(self.TMP_RENAME_PREFIX + childUuid) + if not child: + raise util.SMException( + 'Neither {} nor {} found'.format( + childUuid, self.TMP_RENAME_PREFIX + childUuid + ) + ) + Util.log('Renaming child back to {}'.format(childUuid)) + child.rename(childUuid) + Util.log('Updating the VDI record') + child.setConfig(VDI.DB_VHD_PARENT, parentUuid) + child.setConfig(VDI.DB_VDI_TYPE, vhdutil.VDI_TYPE_VHD) + util.fistpoint.activate( + 'LVHDRT_coaleaf_undo_after_rename2', self.uuid + ) + + # TODO: Maybe deflate here. 
+ + if child.hidden: + child._setHidden(False) + if not parent.hidden: + parent._setHidden(True) + self._updateSlavesOnUndoLeafCoalesce(parent, child) + util.fistpoint.activate('LVHDRT_coaleaf_undo_end', self.uuid) + Util.log('*** leaf-coalesce undo successful') + if util.fistpoint.is_active('LVHDRT_coaleaf_stop_after_recovery'): + child.setConfig(VDI.DB_LEAFCLSC, VDI.LEAFCLSC_DISABLED) + + def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): + Util.log('*** FINISH LEAF-COALESCE') + vdi = self.getVDI(childUuid) + if not vdi: + raise util.SMException('VDI {} not found'.format(childUuid)) + # TODO: Maybe inflate. + try: + self.forgetVDI(parentUuid) + except XenAPI.Failure: + pass + self._updateSlavesOnResize(vdi) + util.fistpoint.activate('LVHDRT_coaleaf_finish_end', self.uuid) + Util.log('*** finished leaf-coalesce successfully') + + def _checkSlaves(self, vdi): + try: + states = self._linstor.get_usage_states(vdi.uuid) + for node_name, state in states.items(): + self._checkSlave(node_name, vdi, state) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: + raise + + @staticmethod + def _checkSlave(node_name, vdi, state): + # If state is None, LINSTOR doesn't know the host state + # (bad connection?). 
+ if state is None: + raise util.SMException( + 'Unknown state for VDI {} on {}'.format(vdi.uuid, node_name) + ) + + if state: + raise util.SMException( + 'VDI {} is in use on {}'.format(vdi.uuid, node_name) + ) + + ################################################################################ # # Helpers @@ -2770,7 +3126,9 @@ def normalizeType(type): "xfs", "zfs", "ext4" ]: type = SR.TYPE_FILE - if not type in SR.TYPES: + if type in ["linstor"]: + type = SR.TYPE_LINSTOR + if type not in SR.TYPES: raise util.SMException("Unsupported SR type: %s" % type) return type diff --git a/drivers/linstor-manager b/drivers/linstor-manager new file mode 100755 index 00000000..f7ce1809 --- /dev/null +++ b/drivers/linstor-manager @@ -0,0 +1,272 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+
+import base64
+import distutils.util
+import subprocess
+import sys
+import XenAPIPlugin
+
+sys.path.append('/opt/xensource/sm/')
+from linstorjournaler import LinstorJournaler
+from linstorvolumemanager import LinstorVolumeManager
+from lock import Lock
+import json
+import LinstorSR
+import util
+import vhdutil
+
+
+FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port'
+LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000']
+
+
+def get_linstor_uri(session):
+    """
+    Return the `linstor://` URI built from the address of the master record.
+    """
+    return 'linstor://{}'.format(util.get_master_rec(session)['address'])
+
+
+def update_port(port, open):
+    """
+    Run the firewall-port script to open or close the TCP `port`.
+    Raises Exception if the script exits with a non-zero code.
+    """
+    fn = 'open' if open else 'close'
+    args = (
+        FIREWALL_PORT_SCRIPT, fn, str(port), 'tcp'
+    )
+
+    (ret, out, err) = util.doexec(args)
+    if ret == 0:
+        return
+    raise Exception('Failed to {} port: {} {}'.format(fn, out, err))
+
+
+def update_all_ports(open):
+    """
+    Open or close every entry of LINSTOR_PORTS.
+    """
+    for port in LINSTOR_PORTS:
+        update_port(port, open)
+
+
+def update_service(start):
+    """
+    Run `systemctl enable|disable --now linstor-satellite`.
+    Raises Exception if the command exits with a non-zero code.
+    """
+    fn = 'enable' if start else 'disable'
+    args = ('systemctl', fn, '--now', 'linstor-satellite')
+    (ret, out, err) = util.doexec(args)
+    if ret == 0:
+        return
+    raise Exception('Failed to {} satellite: {} {}'.format(fn, out, err))
+
+
+def enable(session, args):
+    """
+    Plugin entry point: update the ports and the satellite service
+    according to the boolean string `args['enabled']`.
+    Returns 'True' on success, 'False' (after logging) on any error.
+    """
+    try:
+        enabled = distutils.util.strtobool(args['enabled'])
+        update_all_ports(open=enabled)
+        update_service(start=enabled)
+        return str(True)
+    except Exception as e:
+        # Tag the message with the handler name, like the other handlers do.
+        util.SMlog('linstor-manager:enable error: {}'.format(e))
+        return str(False)
+
+
+def attach(session, args):
+    """
+    Plugin entry point: call LinstorSR.attach_thin for `args['vdiUuid']` of
+    the SR `args['srUuid']`, using the group `args['groupName']`.
+    Returns 'True' on success, 'False' (after logging) on any error.
+    """
+    try:
+        sr_uuid = args['srUuid']
+        vdi_uuid = args['vdiUuid']
+        group_name = args['groupName']
+
+        linstor_uri = get_linstor_uri(session)
+        journaler = LinstorJournaler(
+            linstor_uri, group_name, logger=util.SMlog
+        )
+        linstor = LinstorVolumeManager(
+            linstor_uri,
+            group_name,
+            logger=util.SMlog
+        )
+        LinstorSR.attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid)
+        return str(True)
+    except Exception as e:
+        util.SMlog('linstor-manager:attach error: {}'.format(e))
+        return str(False)
+
+
+def detach(session,
args): + try: + sr_uuid = args['srUuid'] + vdi_uuid = args['vdiUuid'] + group_name = args['groupName'] + + linstor = LinstorVolumeManager( + get_linstor_uri(session), + group_name, + logger=util.SMlog + ) + LinstorSR.detach_thin(session, linstor, sr_uuid, vdi_uuid) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:detach error: {}'.format(e)) + return str(False) + + +def check(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.check(device_path)) + except Exception as e: + util.SMlog('linstor-manager:check error: {}'.format(e)) + raise + + +def get_vhd_info(session, args): + try: + device_path = args['devicePath'] + group_name = args['groupName'] + include_parent = distutils.util.strtobool(args['includeParent']) + + linstor = LinstorVolumeManager( + get_linstor_uri(session), + group_name, + logger=util.SMlog + ) + + def extract_uuid(device_path): + # TODO: Remove new line in the vhdutil module. Not here. + return linstor.get_volume_uuid_from_device_path( + device_path.rstrip('\n') + ) + + vhd_info = vhdutil.getVHDInfo( + device_path, extract_uuid, include_parent + ) + return json.dumps(vhd_info.__dict__) + except Exception as e: + util.SMlog('linstor-manager:get_vhd_info error: {}'.format(e)) + raise + + +def has_parent(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.hasParent(device_path)) + except Exception as e: + util.SMlog('linstor-manager:has_parent error: {}'.format(e)) + raise + + +def get_parent(session, args): + try: + device_path = args['devicePath'] + group_name = args['groupName'] + + linstor = LinstorVolumeManager( + get_linstor_uri(session), + group_name, + logger=util.SMlog + ) + + def extract_uuid(device_path): + # TODO: Remove new line in the vhdutil module. Not here. 
+ return linstor.get_volume_uuid_from_device_path( + device_path.rstrip('\n') + ) + + return vhdutil.getParent(device_path, extract_uuid) + except Exception as e: + util.SMlog('linstor-manager:get_parent error: {}'.format(e)) + raise + + +def get_size_virt(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.getSizeVirt(device_path)) + except Exception as e: + util.SMlog('linstor-manager:get_size_virt error: {}'.format(e)) + raise + + +def get_size_phys(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.getSizePhys(device_path)) + except Exception as e: + util.SMlog('linstor-manager:get_size_phys error: {}'.format(e)) + raise + + +def get_depth(session, args): + try: + device_path = args['devicePath'] + return str(vhdutil.getDepth(device_path)) + except Exception as e: + util.SMlog('linstor-manager:get_depth error: {}'.format(e)) + raise + + +def get_key_hash(session, args): + try: + device_path = args['devicePath'] + return vhdutil.getKeyHash(device_path) or '' + except Exception as e: + util.SMlog('linstor-manager:get_key_hash error: {}'.format(e)) + raise + + +def get_block_bitmap(session, args): + try: + device_path = args['devicePath'] + return base64.b64encode(vhdutil.getBlockBitmap(device_path)) or '' + except Exception as e: + util.SMlog('linstor-manager:get_block_bitmap error: {}'.format(e)) + raise + + +def lock_vdi(session, args): + lock = None + try: + sr_uuid = args['srUuid'] + vdi_uuid = args['vdiUuid'] + group_name = args['groupName'] + locked = distutils.util.strtobool(args['locked']) + + lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) + + linstor = LinstorVolumeManager( + get_linstor_uri(session), + group_name, + logger=util.SMlog + ) + linstor.lock_volume(vdi_uuid, locked) + + return str(True) + except Exception as e: + util.SMlog('linstor-manager:lock_vdi error: {}'.format(e)) + finally: + if lock: + lock.release() + return str(False) + + +if __name__ == '__main__': + XenAPIPlugin.dispatch({ + 
'enable': enable, + 'attach': attach, + 'detach': detach, + 'check': check, + 'getVHDInfo': get_vhd_info, + 'hasParent': has_parent, + 'getParent': get_parent, + 'getSizeVirt': get_size_virt, + 'getSizePhys': get_size_phys, + 'getDepth': get_depth, + 'getKeyHash': get_key_hash, + 'getBlockBitmap': get_block_bitmap, + 'lockVdi': lock_vdi + }) diff --git a/drivers/linstorjournaler.py b/drivers/linstorjournaler.py new file mode 100755 index 00000000..74953305 --- /dev/null +++ b/drivers/linstorjournaler.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +from linstorvolumemanager import LinstorVolumeManager +import linstor +import re +import util + + +class LinstorJournalerError(Exception): + pass + +# ============================================================================== + + +class LinstorJournaler: + """ + Simple journaler that uses LINSTOR properties for persistent "storage". + A journal is a id-value pair, and there can be only one journal for a + given id. An identifier is juste a transaction name. + """ + + REG_TYPE = re.compile('^([^/]+)$') + REG_TRANSACTION = re.compile('^[^/]+/([^/]+)$') + + """ + Types of transaction in the journal. 
+ """ + CLONE = 'clone' + INFLATE = 'inflate' + + @staticmethod + def default_logger(*args): + print(args) + + def __init__(self, uri, group_name, logger=default_logger.__func__): + self._namespace = '{}journal/'.format( + LinstorVolumeManager._build_sr_namespace() + ) + + def connect(): + self._journal = linstor.KV( + LinstorVolumeManager._build_group_name(group_name), + uri=uri, + namespace=self._namespace + ) + + util.retry( + connect, + maxretry=60, + exceptions=[linstor.errors.LinstorNetworkError] + ) + self._logger = logger + + def create(self, type, identifier, value): + # TODO: Maybe rename to 'add' in the future (in Citrix code too). + + key = self._get_key(type, identifier) + + # 1. Ensure transaction doesn't exist. + current_value = self.get(type, identifier) + if current_value is not None: + raise LinstorJournalerError( + 'Journal transaction already exists for \'{}:{}\': {}' + .format(type, identifier, current_value) + ) + + # 2. Write! + try: + self._reset_namespace() + self._logger( + 'Create journal transaction \'{}:{}\''.format(type, identifier) + ) + self._journal[key] = str(value) + except Exception as e: + try: + self._journal.pop(key, 'empty') + except Exception as e2: + self._logger( + 'Failed to clean up failed journal write: {} (Ignored)' + .format(e2) + ) + + raise LinstorJournalerError( + 'Failed to write to journal: {}'.format(e) + ) + + def remove(self, type, identifier): + key = self._get_key(type, identifier) + try: + self._reset_namespace() + self._logger( + 'Destroy journal transaction \'{}:{}\'' + .format(type, identifier) + ) + self._journal.pop(key) + except Exception as e: + raise LinstorJournalerError( + 'Failed to remove transaction \'{}:{}\': {}' + .format(type, identifier, e) + ) + + def get(self, type, identifier): + return self._journal.get(self._get_key(type, identifier)) + + def get_all(self, type): + entries = {} + + self._journal.namespace = self._namespace + '{}/'.format(type) + for (key, value) in 
self._journal.items(): + res = self.REG_TYPE.match(key) + if res: + identifier = res.groups()[0] + entries[identifier] = value + return entries + + # Added to compatibility with Citrix API. + def getAll(self, type): + return self.get_all(type) + + def has_entries(self, identifier): + self._reset_namespace() + for (key, value) in self._journal.items(): + res = self.REG_TRANSACTION.match(key) + if res: + current_identifier = res.groups()[0] + if current_identifier == identifier: + return True + return False + + # Added to compatibility with Citrix API. + def hasJournals(self, identifier): + return self.has_entries(identifier) + + def _reset_namespace(self): + self._journal.namespace = self._namespace + + @staticmethod + def _get_key(type, identifier): + return '{}/{}'.format(type, identifier) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py new file mode 100644 index 00000000..f31c7525 --- /dev/null +++ b/drivers/linstorvhdutil.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import base64 +import distutils.util +import errno +import json +import socket +import util +import vhdutil +import xs_errors + +MANAGER_PLUGIN = 'linstor-manager' + + +def linstorhostcall(local_method, remote_method): + def decorated(func): + def wrapper(*args, **kwargs): + self = args[0] + vdi_uuid = args[1] + + device_path = self._linstor.build_device_path( + self._linstor.get_volume_name(vdi_uuid) + ) + + # A. Try a call using directly the DRBD device to avoid + # remote request. + + # Try to read locally if the device is not in use or if the device + # is up to date and not diskless. + (node_names, in_use) = \ + self._linstor.find_up_to_date_diskfull_nodes(vdi_uuid) + + try: + if not in_use or socket.gethostname() in node_names: + return local_method(device_path, *args[2:], **kwargs) + except util.CommandException as e: + # EMEDIUMTYPE constant (124) is not available in python2. + if e.code != errno.EROFS and e.code != 124: + raise + + # B. Execute the plugin on master or slave. 
+ def exec_remote_method(): + host_ref = self._get_readonly_host( + vdi_uuid, device_path, node_names + ) + args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + args.update(**kwargs) + + try: + response = self._session.xenapi.host.call_plugin( + host_ref, MANAGER_PLUGIN, remote_method, args + ) + except Exception as e: + util.SMlog('call-plugin ({} with {}) exception: {}'.format( + remote_method, args, e + )) + raise + + util.SMlog('call-plugin ({} with {}) returned: {}'.format( + remote_method, args, response + )) + if response == 'False': + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Plugin {} failed'.format(MANAGER_PLUGIN) + ) + kwargs['response'] = response + + util.retry(exec_remote_method, 5, 3) + return func(*args, **kwargs) + return wrapper + return decorated + + +class LinstorVhdUtil: + def __init__(self, session, linstor): + self._session = session + self._linstor = linstor + + @linstorhostcall(vhdutil.check, 'check') + def check(self, vdi_uuid, **kwargs): + return distutils.util.strtobool(kwargs['response']) + + def get_vhd_info(self, vdi_uuid, include_parent=True): + kwargs = {'includeParent': str(include_parent)} + return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) + + @linstorhostcall(vhdutil.getVHDInfo, 'getVHDInfo') + def _get_vhd_info(self, vdi_uuid, *args, **kwargs): + obj = json.loads(kwargs['response']) + + vhd_info = vhdutil.VHDInfo(vdi_uuid) + vhd_info.sizeVirt = obj['sizeVirt'] + vhd_info.sizePhys = obj['sizePhys'] + if 'parentPath' in obj: + vhd_info.parentPath = obj['parentPath'] + vhd_info.parentUuid = obj['parentUuid'] + vhd_info.hidden = obj['hidden'] + vhd_info.path = obj['path'] + + return vhd_info + + @linstorhostcall(vhdutil.hasParent, 'hasParent') + def has_parent(self, vdi_uuid, **kwargs): + return distutils.util.strtobool(kwargs['response']) + + def get_parent(self, vdi_uuid): + return self._get_parent(vdi_uuid, self._extract_uuid) + + @linstorhostcall(vhdutil.getParent, 
'getParent') + def _get_parent(self, vdi_uuid, *args, **kwargs): + return kwargs['response'] + + @linstorhostcall(vhdutil.getSizeVirt, 'getSizeVirt') + def get_size_virt(self, vdi_uuid, **kwargs): + return int(kwargs['response']) + + @linstorhostcall(vhdutil.getSizePhys, 'getSizePhys') + def get_size_phys(self, vdi_uuid, **kwargs): + return int(kwargs['response']) + + @linstorhostcall(vhdutil.getDepth, 'getDepth') + def get_depth(self, vdi_uuid, **kwargs): + return int(kwargs['response']) + + @linstorhostcall(vhdutil.getKeyHash, 'getKeyHash') + def get_key_hash(self, vdi_uuid, **kwargs): + return kwargs['response'] or None + + @linstorhostcall(vhdutil.getBlockBitmap, 'getBlockBitmap') + def get_block_bitmap(self, vdi_uuid, **kwargs): + return base64.b64decode(kwargs['response']) + + # -------------------------------------------------------------------------- + # Helpers. + # -------------------------------------------------------------------------- + + def _extract_uuid(self, device_path): + # TODO: Remove new line in the vhdutil module. Not here. + return self._linstor.get_volume_uuid_from_device_path( + device_path.rstrip('\n') + ) + + def _get_readonly_host(self, vdi_uuid, device_path, node_names): + """ + When vhd-util is called to fetch VDI info we must find a + diskfull DRBD disk to read the data. It's the goal of this function. + Why? Because when a VHD is open in RO mode, the LVM layer is used + directly to bypass DRBD verifications (we can have only one process + that reads/writes to disk with DRBD devices). 
+ """ + + if not node_names: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to find diskfull node: {} (path={})' + .format(vdi_uuid, device_path) + ) + + hosts = self._session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + if host_record['hostname'] in node_names: + return host_ref + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to find a valid host from VDI: {} (path={})' + .format(vdi_uuid, device_path) + ) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py new file mode 100755 index 00000000..d4004217 --- /dev/null +++ b/drivers/linstorvolumemanager.py @@ -0,0 +1,1713 @@ +#!/usr/bin/env python +# +# Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + + +import json +import linstor +import os.path +import re +import socket +import time +import util + + +def round_up(value, divisor): + assert divisor + divisor = int(divisor) + return int((int(value) + divisor - 1) / divisor) * divisor + + +def round_down(value, divisor): + assert divisor + value = int(value) + return value - (value % int(divisor)) + + +class LinstorVolumeManagerError(Exception): + ERR_GENERIC = 0, + ERR_VOLUME_EXISTS = 1, + ERR_VOLUME_NOT_EXISTS = 2 + + def __init__(self, message, code=ERR_GENERIC): + super(LinstorVolumeManagerError, self).__init__(message) + self._code = code + + @property + def code(self): + return self._code + +# ============================================================================== + +# Note: +# If a storage pool is not accessible after a network change: +# linstor node interface modify default --ip + + +class LinstorVolumeManager(object): + """ + API to manager LINSTOR volumes in XCP-ng. + A volume in this context is a physical part of the storage layer. + """ + + DEV_ROOT_PATH = '/dev/drbd/by-res/' + + # Default LVM extent size. + BLOCK_SIZE = 4 * 1024 * 1024 + + # List of volume properties. + PROP_METADATA = 'metadata' + PROP_NOT_EXISTS = 'not-exists' + PROP_VOLUME_NAME = 'volume-name' + PROP_IS_READONLY_TIMESTAMP = 'readonly-timestamp' + + # A volume can only be locked for a limited duration. + # The goal is to give enough time to slaves to execute some actions on + # a device before an UUID update or a coalesce for example. + # Expiration is expressed in seconds. + LOCKED_EXPIRATION_DELAY = 1 * 60 + + # Used when volume uuid is being updated. + PROP_UPDATING_UUID_SRC = 'updating-uuid-src' + + # States of property PROP_NOT_EXISTS. + STATE_EXISTS = '0' + STATE_NOT_EXISTS = '1' + STATE_CREATING = '2' + + # Property namespaces. + NAMESPACE_SR = 'xcp/sr' + NAMESPACE_VOLUME = 'volume' + + # Regex to match properties. 
+ REG_PROP = '^([^/]+)/{}$' + + REG_METADATA = re.compile(REG_PROP.format(PROP_METADATA)) + REG_NOT_EXISTS = re.compile(REG_PROP.format(PROP_NOT_EXISTS)) + REG_VOLUME_NAME = re.compile(REG_PROP.format(PROP_VOLUME_NAME)) + REG_UPDATING_UUID_SRC = re.compile(REG_PROP.format(PROP_UPDATING_UUID_SRC)) + + # Prefixes of SR/VOLUME in the LINSTOR DB. + # A LINSTOR (resource, group, ...) name cannot start with a number. + # So we add a prefix behind our SR/VOLUME uuids. + PREFIX_SR = 'xcp-sr-' + PREFIX_VOLUME = 'xcp-volume-' + + @staticmethod + def default_logger(*args): + print(args) + + # -------------------------------------------------------------------------- + # API. + # -------------------------------------------------------------------------- + + class VolumeInfo(object): + __slots__ = ( + 'name', + 'physical_size', # Total physical size used by this volume on + # all disks. + 'virtual_size' # Total virtual available size of this volume + # (i.e. the user size at creation). + ) + + def __init__(self, name): + self.name = name + self.physical_size = 0 + self.virtual_size = 0 + + def __repr__(self): + return 'VolumeInfo("{}", {}, {})'.format( + self.name, self.physical_size, self.virtual_size + ) + + # -------------------------------------------------------------------------- + + def __init__( + self, uri, group_name, repair=False, logger=default_logger.__func__ + ): + """ + Create a new LinstorApi object. + :param str uri: URI to communicate with the LINSTOR controller. + :param str group_name: The SR goup name to use. + :param bool repair: If true we try to remove bad volumes due to a crash + or unexpected behavior. + :param function logger: Function to log messages. + """ + + self._uri = uri + self._linstor = self._create_linstor_instance(uri) + self._base_group_name = group_name + + # Ensure group exists. 
+ group_name = self._build_group_name(group_name) + groups = self._linstor.resource_group_list_raise([group_name]) + groups = groups.resource_groups + if not groups: + raise LinstorVolumeManagerError( + 'Unable to find `{}` Linstor SR'.format(group_name) + ) + + # Ok. ;) + self._logger = logger + self._redundancy = groups[0].select_filter.place_count + self._group_name = group_name + self._build_volumes(repair=repair) + + @property + def group_name(self): + """ + Give the used group name. + :return: The group name. + :rtype: str + """ + return self._base_group_name + + @property + def volumes(self): + """ + Give the volumes uuid set. + :return: The volumes uuid set. + :rtype: set(str) + """ + return self._volumes + + @property + def volumes_with_name(self): + """ + Give a volume dictionnary that contains names actually owned. + :return: A volume/name dict. + :rtype: dict(str, str) + """ + return self._get_volumes_by_property(self.REG_VOLUME_NAME) + + @property + def volumes_with_info(self): + """ + Give a volume dictionnary that contains VolumeInfos. + :return: A volume/VolumeInfo dict. + :rtype: dict(str, VolumeInfo) + """ + + volumes = {} + + all_volume_info = self._get_volumes_info() + volume_names = self.volumes_with_name + for volume_uuid, volume_name in volume_names.items(): + if volume_name: + volume_info = all_volume_info.get(volume_name) + if volume_info: + volumes[volume_uuid] = volume_info + continue + + # Well I suppose if this volume is not available, + # LINSTOR has been used directly without using this API. + volumes[volume_uuid] = self.VolumeInfo('') + + return volumes + + @property + def volumes_with_metadata(self): + """ + Give a volume dictionnary that contains metadata. + :return: A volume/metadata dict. 
+ :rtype: dict(str, dict) + """ + + volumes = {} + + metadata = self._get_volumes_by_property(self.REG_METADATA) + for volume_uuid, volume_metadata in metadata.items(): + if volume_metadata: + volume_metadata = json.loads(volume_metadata) + if isinstance(volume_metadata, dict): + volumes[volume_uuid] = volume_metadata + continue + raise LinstorVolumeManagerError( + 'Expected dictionary in volume metadata: {}' + .format(volume_uuid) + ) + + volumes[volume_uuid] = {} + + return volumes + + @property + def max_volume_size_allowed(self): + """ + Give the max volume size currently available in B. + :return: The current size. + :rtype: int + """ + + candidates = self._find_best_size_candidates() + if not candidates: + raise LinstorVolumeManagerError( + 'Failed to get max volume size allowed' + ) + + size = candidates[0].max_volume_size + if size < 0: + raise LinstorVolumeManagerError( + 'Invalid max volume size allowed given: {}'.format(size) + ) + return self.round_down_volume_size(size * 1024) + + @property + def physical_size(self): + """ + Give the total physical size of the SR. + :return: The physical size. + :rtype: int + """ + return self._compute_size('total_capacity') + + @property + def physical_free_size(self): + """ + Give the total free physical size of the SR. + :return: The physical free size. + :rtype: int + """ + return self._compute_size('free_capacity') + + @property + def total_allocated_volume_size(self): + """ + Give the sum of all created volumes. + :return: The physical required size to use the volumes. + :rtype: int + """ + + size = 0 + for resource in self._linstor.resource_list_raise().resources: + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". 
+ if volume.storage_pool_name == self._group_name: + current_size = volume.usable_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + size += current_size + return size * 1024 + + @property + def metadata(self): + """ + Get the metadata of the SR. + :return: Dictionary that contains metadata. + :rtype: dict(str, dict) + """ + + sr_properties = self._get_sr_properties() + metadata = sr_properties.get(self.PROP_METADATA) + if metadata is not None: + metadata = json.loads(metadata) + if isinstance(metadata, dict): + return metadata + raise LinstorVolumeManagerError( + 'Expected dictionary in SR metadata: {}'.format( + self._group_name + ) + ) + + return {} + + @metadata.setter + def metadata(self, metadata): + """ + Set the metadata of the SR. + :param dict metadata: Dictionary that contains metadata. + """ + + assert isinstance(metadata, dict) + sr_properties = self._get_sr_properties() + sr_properties[self.PROP_METADATA] = json.dumps(metadata) + + @property + def disconnected_hosts(self): + """ + Get the list of disconnected hosts. + :return: Set that contains disconnected hosts. + :rtype: set(str) + """ + + pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + + disconnected_hosts = set() + for pool in pools: + for report in pool.reports: + if report.ret_code & linstor.consts.WARN_NOT_CONNECTED == \ + linstor.consts.WARN_NOT_CONNECTED: + disconnected_hosts.add(pool.node_name) + break + return disconnected_hosts + + def check_volume_exists(self, volume_uuid): + """ + Check if a volume exists in the SR. + :return: True if volume exists. + :rtype: bool + """ + return volume_uuid in self._volumes + + def create_volume(self, volume_uuid, size, persistent=True): + """ + Create a new volume on the SR. + :param str volume_uuid: The volume uuid to use. + :param int size: volume size in B. 
+ :param bool persistent: If false the volume will be unavailable + on the next constructor call LinstorSR(...). + :return: The current device path of the volume. + :rtype: str + """ + + self._logger('Creating LINSTOR volume {}...'.format(volume_uuid)) + volume_name = self.build_volume_name(util.gen_uuid()) + volume_properties = self._create_volume_with_properties( + volume_uuid, volume_name, size, place_resources=True + ) + + try: + self._logger( + 'Find device path of LINSTOR volume {}...'.format(volume_uuid) + ) + device_path = self._find_device_path(volume_uuid, volume_name) + if persistent: + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + self._volumes.add(volume_uuid) + self._logger( + 'LINSTOR volume {} created!'.format(volume_uuid) + ) + return device_path + except Exception: + self._force_destroy_volume(volume_uuid, volume_properties) + raise + + def mark_volume_as_persistent(self, volume_uuid): + """ + Mark volume as persistent if created with persistent=False. + :param str volume_uuid: The volume uuid to mark. + """ + + self._ensure_volume_exists(volume_uuid) + + # Mark volume as persistent. + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + + def destroy_volume(self, volume_uuid): + """ + Destroy a volume. + :param str volume_uuid: The volume uuid to destroy. + """ + + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + + # Mark volume as destroyed. + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS + + self._volumes.remove(volume_uuid) + self._destroy_volume(volume_uuid, volume_properties) + + def lock_volume(self, volume_uuid, locked=True): + """ + Prevent modifications of the volume properties during + "self.LOCKED_EXPIRATION_DELAY" seconds. The SR must be locked + when used. This method is useful to attach/detach correctly a volume on + a slave. 
Without it the GC can rename a volume, in this case the old + volume path can be used by a slave... + :param str volume_uuid: The volume uuid to protect/unprotect. + :param bool locked: Lock/unlock the volume. + """ + + self._ensure_volume_exists(volume_uuid) + + self._logger( + '{} volume {} as locked'.format( + 'Mark' if locked else 'Unmark', + volume_uuid + ) + ) + + volume_properties = self._get_volume_properties(volume_uuid) + if locked: + volume_properties[ + self.PROP_IS_READONLY_TIMESTAMP + ] = str(time.time()) + elif self.PROP_IS_READONLY_TIMESTAMP in volume_properties: + volume_properties.pop(self.PROP_IS_READONLY_TIMESTAMP) + + def ensure_volume_is_not_locked(self, volume_uuid, timeout=None): + """ + Ensure a volume is not locked. Wait if necessary. + :param str volume_uuid: The volume uuid to check. + :param int timeout: If the volume is always locked after the expiration + of the timeout, an exception is thrown. + """ + return self.ensure_volume_list_is_not_locked([volume_uuid], timeout) + + def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): + checked = set() + for volume_uuid in volume_uuids: + if volume_uuid in self._volumes: + checked.add(volume_uuid) + + if not checked: + return + + waiting = False + + start = time.time() + while True: + # Can't delete in for loop, use a copy of the list. 
+ remaining = checked.copy() + for volume_uuid in checked: + volume_properties = self._get_volume_properties(volume_uuid) + timestamp = volume_properties.get( + self.PROP_IS_READONLY_TIMESTAMP + ) + if timestamp is None: + remaining.remove(volume_uuid) + continue + + now = time.time() + if now - float(timestamp) > self.LOCKED_EXPIRATION_DELAY: + self._logger( + 'Remove readonly timestamp on {}'.format(volume_uuid) + ) + volume_properties.pop(self.PROP_IS_READONLY_TIMESTAMP) + remaining.remove(volume_uuid) + continue + + if not waiting: + self._logger( + 'Volume {} is locked, waiting...'.format(volume_uuid) + ) + waiting = True + break + + if not remaining: + break + checked = remaining + + if timeout is not None and now - start > timeout: + raise LinstorVolumeManagerError( + 'volume `{}` is locked and timeout has been reached' + .format(volume_uuid), + LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS + ) + + # We must wait to use the volume. After that we can modify it + # ONLY if the SR is locked to avoid bad reads on the slaves. + time.sleep(1) + + if waiting: + self._logger('No volume locked now!') + + def introduce_volume(self, volume_uuid): + pass # TODO: Implement me. + + def resize_volume(self, volume_uuid, new_size): + """ + Resize a volume. + :param str volume_uuid: The volume uuid to resize. + :param int new_size: New size in B. + """ + + volume_name = self.get_volume_name(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + new_size = self.round_up_volume_size(new_size) + + result = self._linstor.volume_dfn_modify( + rsc_name=volume_name, + volume_nr=0, + size=new_size / 1024 + ) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not resize volume `{}` from SR `{}`: {}' + .format(volume_uuid, self._group_name, error_str) + ) + + def get_volume_name(self, volume_uuid): + """ + Get the name of a particular volume. + :param str volume_uuid: The volume uuid of the name to get. 
+ :return: The volume name. + :rtype: str + """ + + self._ensure_volume_exists(volume_uuid) + volume_properties = self._get_volume_properties(volume_uuid) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + if volume_name: + return volume_name + raise LinstorVolumeManagerError( + 'Failed to get volume name of {}'.format(volume_uuid) + ) + + def get_volume_size(self, volume_uuid): + """ + Get the size of a particular volume. + :param str volume_uuid: The volume uuid of the size to get. + :return: The volume size. + :rtype: int + """ + + volume_name = self.get_volume_name(volume_uuid) + dfns = self._linstor.resource_dfn_list_raise( + query_volume_definitions=True, + filter_by_resource_definitions=[volume_name] + ).resource_definitions + + size = dfns[0].volume_definitions[0].size + if size < 0: + raise LinstorVolumeManagerError( + 'Failed to get volume size of: {}'.format(volume_uuid) + ) + return size * 1024 + + def get_volume_info(self, volume_uuid): + """ + Get the volume info of a particular volume. + :param str volume_uuid: The volume uuid of the volume info to get. + :return: The volume info. + :rtype: VolumeInfo + """ + + volume_name = self.get_volume_name(volume_uuid) + return self._get_volumes_info(filter=[volume_name])[volume_name] + + def get_device_path(self, volume_uuid): + """ + Get the dev path of a volume. + :param str volume_uuid: The volume uuid to get the dev path. + :return: The current device path of the volume. + :rtype: str + """ + + volume_name = self.get_volume_name(volume_uuid) + return self._find_device_path(volume_uuid, volume_name) + + def get_volume_uuid_from_device_path(self, device_path): + """ + Get the volume uuid of a device_path. + :param str device_path: The dev path to find the volume uuid. + :return: The volume uuid of the local device path. 
+ :rtype: str + """ + + expected_volume_name = \ + self.get_volume_name_from_device_path(device_path) + + volume_names = self.volumes_with_name + for volume_uuid, volume_name in volume_names.items(): + if volume_name == expected_volume_name: + return volume_uuid + + raise LinstorVolumeManagerError( + 'Unable to find volume uuid from dev path `{}`'.format(device_path) + ) + + def get_volume_name_from_device_path(self, device_path): + """ + Get the volume name of a device_path on the current host. + :param str device_path: The dev path to find the volume name. + :return: The volume name of the local device path. + :rtype: str + """ + + node_name = socket.gethostname() + resources = self._linstor.resource_list_raise( + filter_by_nodes=[node_name] + ).resources + + real_device_path = os.path.realpath(device_path) + for resource in resources: + if resource.volumes[0].device_path == real_device_path: + return resource.name + + raise LinstorVolumeManagerError( + 'Unable to find volume name from dev path `{}`' + .format(device_path) + ) + + def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): + """ + Change the uuid of a volume. + :param str volume_uuid: The volume to modify. + :param str new_volume_uuid: The new volume uuid to use. + :param bool force: If true we doesn't check if volume_uuid is in the + volume list. I.e. the volume can be marked as deleted but the volume + can still be in the LINSTOR KV store if the deletion has failed. + In specific cases like "undo" after a failed clone we must rename a bad + deleted VDI. + """ + + self._logger( + 'Trying to update volume UUID {} to {}...' 
+            .format(volume_uuid, new_volume_uuid)
+        )
+        if not force:
+            self._ensure_volume_exists(volume_uuid)
+        self.ensure_volume_is_not_locked(volume_uuid)
+
+        if new_volume_uuid in self._volumes:
+            raise LinstorVolumeManagerError(
+                'Volume `{}` already exists'.format(new_volume_uuid),
+                LinstorVolumeManagerError.ERR_VOLUME_EXISTS
+            )
+
+        volume_properties = self._get_volume_properties(volume_uuid)
+        if volume_properties.get(self.PROP_UPDATING_UUID_SRC):
+            raise LinstorVolumeManagerError(
+                'Cannot update volume uuid {}: invalid state'
+                .format(volume_uuid)
+            )
+
+        new_volume_properties = self._get_volume_properties(
+            new_volume_uuid
+        )
+        if list(new_volume_properties.items()):
+            raise LinstorVolumeManagerError(
+                'Cannot update volume uuid {} to {}: '
+                .format(volume_uuid, new_volume_uuid) +
+                'this last one is not empty'
+            )
+
+        assert volume_properties.namespace != \
+            new_volume_properties.namespace
+
+        try:
+            # 1. Mark new volume properties with PROP_UPDATING_UUID_SRC.
+            # If we crash after that, the new properties can be removed
+            # properly.
+            new_volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS
+            new_volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid
+
+            # 2. Copy the properties.
+            for prop_name in [self.PROP_METADATA, self.PROP_VOLUME_NAME]:
+                new_volume_properties[prop_name] = \
+                    volume_properties.get(prop_name)
+
+            # 3. Ok!
+            new_volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS
+        except Exception as e:
+            try:
+                new_volume_properties.clear()
+            except Exception as clear_error:
+                self._logger(
+                    'Failed to clear new volume properties: {} (ignoring...)'
+                    .format(clear_error)
+                )
+            raise LinstorVolumeManagerError(
+                'Failed to copy volume properties: {}'.format(e)
+            )
+
+        try:
+            # 4. After this point, it's ok we can remove the
+            # PROP_UPDATING_UUID_SRC property and clear the src properties
+            # without problems.
+ volume_properties.clear() + new_volume_properties.pop(self.PROP_UPDATING_UUID_SRC) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to clear volume properties ' + 'after volume uuid update: {}'.format(e) + ) + + self._volumes.remove(volume_uuid) + self._volumes.add(new_volume_uuid) + + self._logger( + 'UUID update succeeded of {} to {}! (properties={})' + .format( + volume_uuid, new_volume_uuid, + self._get_filtered_properties(new_volume_properties) + ) + ) + + def update_volume_name(self, volume_uuid, volume_name): + """ + Change the volume name of a volume. + :param str volume_uuid: The volume to modify. + :param str volume_name: The volume_name to use. + """ + + self._ensure_volume_exists(volume_uuid) + self.ensure_volume_is_not_locked(volume_uuid) + if not volume_name.startswith(self.PREFIX_VOLUME): + raise LinstorVolumeManagerError( + 'Volume name `{}` must be start with `{}`' + .format(volume_name, self.PREFIX_VOLUME) + ) + + if volume_name not in self._fetch_resource_names(): + raise LinstorVolumeManagerError( + 'Volume `{}` doesn\'t exist'.format(volume_name) + ) + + volume_properties = self._get_volume_properties(volume_uuid) + volume_properties[self.PROP_VOLUME_NAME] = volume_name + + def get_usage_states(self, volume_uuid): + """ + Check if a volume is currently used. + :param str volume_uuid: The volume uuid to check. + :return: A dictionnary that contains states. + :rtype: dict(str, bool or None) + """ + + states = {} + + volume_name = self.get_volume_name(volume_uuid) + for resource_state in self._linstor.resource_list_raise( + filter_by_resources=[volume_name] + ).resource_states: + states[resource_state.node_name] = resource_state.in_use + + return states + + def get_volume_metadata(self, volume_uuid): + """ + Get the metadata of a volume. + :return: Dictionary that contains metadata. 
+        :rtype: dict
+        """
+
+        self._ensure_volume_exists(volume_uuid)
+        volume_properties = self._get_volume_properties(volume_uuid)
+        metadata = volume_properties.get(self.PROP_METADATA)
+        if metadata:
+            metadata = json.loads(metadata)
+            if isinstance(metadata, dict):
+                return metadata
+            raise LinstorVolumeManagerError(
+                'Expected dictionary in volume metadata: {}'
+                .format(volume_uuid)
+            )
+        return {}
+
+    def set_volume_metadata(self, volume_uuid, metadata):
+        """
+        Set the metadata of a volume.
+        :param dict metadata: Dictionary that contains metadata.
+        """
+
+        self._ensure_volume_exists(volume_uuid)
+        self.ensure_volume_is_not_locked(volume_uuid)
+
+        assert isinstance(metadata, dict)
+        volume_properties = self._get_volume_properties(volume_uuid)
+        volume_properties[self.PROP_METADATA] = json.dumps(metadata)
+
+    def update_volume_metadata(self, volume_uuid, metadata):
+        """
+        Update the metadata of a volume. It modifies only the given keys.
+        Unlike set_volume_metadata, it doesn't remove unreferenced keys.
+        :param dict metadata: Dictionary that contains metadata.
+        """
+
+        self._ensure_volume_exists(volume_uuid)
+        self.ensure_volume_is_not_locked(volume_uuid)
+
+        assert isinstance(metadata, dict)
+        volume_properties = self._get_volume_properties(volume_uuid)
+
+        current_metadata = json.loads(
+            volume_properties.get(self.PROP_METADATA, '{}')
+        )
+        if not isinstance(current_metadata, dict):
+            raise LinstorVolumeManagerError(
+                'Expected dictionary in volume metadata: {}'
+                .format(volume_uuid)
+            )
+
+        for key, value in metadata.items():
+            current_metadata[key] = value
+        volume_properties[self.PROP_METADATA] = json.dumps(current_metadata)
+
+    def shallow_clone_volume(self, volume_uuid, clone_uuid, persistent=True):
+        """
+        Clone a volume. Do not copy the data, this method creates a new volume
+        with the same size. It tries to create the volume on the same host
+        as the volume source.
+        :param str volume_uuid: The volume to clone.
+        :param str clone_uuid: The cloned volume.
+        :param bool persistent: If false the volume will be unavailable
+        on the next constructor call LinstorSR(...).
+        :return: The current device path of the cloned volume.
+        :rtype: str
+        """
+
+        volume_name = self.get_volume_name(volume_uuid)
+        self.ensure_volume_is_not_locked(volume_uuid)
+
+        # 1. Find ideal nodes + size to use.
+        ideal_node_names, size = self._get_volume_node_names_and_size(
+            volume_name
+        )
+        if size <= 0:
+            raise LinstorVolumeManagerError(
+                'Invalid size of {} for volume `{}`'.format(size, volume_name)
+            )
+
+        # 2. Find the node(s) with the maximum space.
+        candidates = self._find_best_size_candidates()
+        if not candidates:
+            raise LinstorVolumeManagerError(
+                'Unable to shallow clone volume `{}`, no free space found.'
+                .format(volume_uuid))
+
+        # 3. Compute node names and check whether we can clone
+        # on the same nodes as the source volume.
+        def find_best_nodes():
+            for candidate in candidates:
+                for node_name in candidate.node_names:
+                    if node_name in ideal_node_names:
+                        return candidate.node_names
+
+        node_names = find_best_nodes()
+        if not node_names:
+            node_names = candidates[0].node_names
+
+        if len(node_names) < self._redundancy:
+            raise LinstorVolumeManagerError(
+                'Unable to shallow clone volume `{}`, '.format(volume_uuid) +
+                '{} are required to clone, found: {}'.format(
+                    self._redundancy, len(node_names)
+                )
+            )
+
+        # 4. Compute resources to create.
+        clone_volume_name = self.build_volume_name(util.gen_uuid())
+        diskless_node_names = self._get_node_names()
+        resources = []
+        for node_name in node_names:
+            diskless_node_names.remove(node_name)
+            resources.append(linstor.ResourceData(
+                node_name=node_name,
+                rsc_name=clone_volume_name,
+                storage_pool=self._group_name
+            ))
+        for node_name in diskless_node_names:
+            resources.append(linstor.ResourceData(
+                node_name=node_name,
+                rsc_name=clone_volume_name,
+                diskless=True
+            ))
+
+        # 5. Create resources!
+        def clean(properties):
+            try:
+                self._destroy_volume(clone_uuid, properties)
+            except Exception as e:
+                self._logger(
+                    'Unable to destroy volume {} after shallow clone fail: {}'
+                    .format(clone_uuid, e)
+                )
+
+        def create():
+            volume_properties = self._create_volume_with_properties(
+                clone_uuid, clone_volume_name, size,
+                place_resources=False
+            )
+            # The call above cleans up after itself: nothing to undo yet.
+            try:
+                result = self._linstor.resource_create(resources)
+                error_str = self._get_error_str(result)
+                if error_str:
+                    raise LinstorVolumeManagerError(
+                        'Could not create cloned volume `{}` of `{}` from '
+                        'SR `{}`: {}'.format(
+                            clone_uuid, volume_uuid, self._group_name,
+                            error_str
+                        )
+                    )
+                return volume_properties
+            except Exception:
+                clean(volume_properties)
+                raise
+
+        # Retry because we can get errors like this:
+        # "Resource disappeared while waiting for it to be ready" or
+        # "Resource did not became ready on node 'XXX' within reasonable time, check Satellite for errors."
+        # in the LINSTOR server.
+        volume_properties = util.retry(create, maxretry=5)
+
+        try:
+            device_path = self._find_device_path(clone_uuid, clone_volume_name)
+            if persistent:
+                volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS
+            self._volumes.add(clone_uuid)
+            return device_path
+        except Exception:
+            clean(volume_properties)
+            raise
+
+    def remove_resourceless_volumes(self):
+        """
+        Remove all volumes without valid or non-empty name
+        (i.e. without LINSTOR resource). It's different than
+        LinstorVolumeManager constructor that takes a `repair` param that
+        removes volumes with `PROP_NOT_EXISTS` to 1.
+        """
+
+        resource_names = self._fetch_resource_names()
+        for volume_uuid, volume_name in self.volumes_with_name.items():
+            if not volume_name or volume_name not in resource_names:
+                self.destroy_volume(volume_uuid)
+
+    def destroy(self, force=False):
+        """
+        Destroy this SR. Object should not be used after that.
+        :param bool force: Try to destroy volumes before if true.
+ """ + + if (force): + for volume_uuid in self._volumes: + self.destroy_volume(volume_uuid) + + # TODO: Throw exceptions in the helpers below if necessary. + # TODO: What's the required action if it exists remaining volumes? + + self._destroy_resource_group(self._linstor, self._group_name) + + pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + for pool in pools: + self._destroy_storage_pool( + self._linstor, pool.name, pool.node_name + ) + + def find_up_to_date_diskfull_nodes(self, volume_uuid): + """ + Find all nodes that contain a specific volume using diskfull disks. + The disk must be up to data to be used. + :param str volume_uuid: The volume to use. + :return: The available nodes. + :rtype: tuple(set(str), bool) + """ + + volume_name = self.get_volume_name(volume_uuid) + + in_use = False + node_names = set() + resource_list = self._linstor.resource_list_raise( + filter_by_resources=[volume_name] + ) + for resource_state in resource_list.resource_states: + volume_state = resource_state.volume_states[0] + if volume_state.disk_state == 'UpToDate': + node_names.add(resource_state.node_name) + if resource_state.in_use: + in_use = True + + return (node_names, in_use) + + @classmethod + def create_sr( + cls, uri, group_name, node_names, redundancy, + thin_provisioning=False, + logger=default_logger.__func__ + ): + """ + Create a new SR on the given nodes. + :param str uri: URI to communicate with the LINSTOR controller. + :param str group_name: The SR group_name to use. + :param list[str] node_names: String list of nodes. + :param int redundancy: How many copy of volumes should we store? + :param function logger: Function to log messages. + :return: A new LinstorSr instance. + :rtype: LinstorSr + """ + + # 1. Check if SR already exists. 
+ lin = cls._create_linstor_instance(uri) + driver_pool_name = group_name + group_name = cls._build_group_name(group_name) + pools = lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) + + # TODO: Maybe if the SR already exists and if the nodes are the same, + # we can try to use it directly. + pools = pools.storage_pools + if pools: + existing_node_names = map(lambda pool: pool.node_name, pools) + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`. It already exists on node(s): {}' + .format(group_name, existing_node_names) + ) + + if lin.resource_group_list_raise( + [group_name] + ).resource_groups: + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`: The group name already exists' + .format(group_name) + ) + + if thin_provisioning: + driver_pool_parts = driver_pool_name.split('/') + if not len(driver_pool_parts) == 2: + raise LinstorVolumeManagerError( + 'Invalid group name using thin provisioning. ' + 'Expected format: \'VG/LV`\'' + ) + + # 2. Create storage pool on each node + resource group. + i = 0 + try: + # 2.a. Create storage pools. + while i < len(node_names): + node_name = node_names[i] + + result = lin.storage_pool_create( + node_name=node_name, + storage_pool_name=group_name, + storage_driver='LVM_THIN' if thin_provisioning else 'LVM', + driver_pool_name=driver_pool_name + ) + + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create SP `{}` on node `{}`: {}'.format( + group_name, + node_name, + error_str + ) + ) + i += 1 + + # 2.b. Create resource group. + result = lin.resource_group_create( + name=group_name, + place_count=redundancy, + storage_pool=group_name, + diskless_on_remaining=True + ) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create RG `{}`: {}'.format( + group_name, error_str + ) + ) + + # 2.c. Create volume group. 
+ result = lin.volume_group_create(group_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create VG `{}`: {}'.format( + group_name, error_str + ) + ) + + # 3. Remove storage pools/resource/volume group in the case of errors. + except Exception as e: + try: + cls._destroy_resource_group(lin, group_name) + except Exception: + pass + j = 0 + i = min(i, len(node_names) - 1) + while j <= i: + try: + cls._destroy_storage_pool(lin, group_name, node_names[j]) + except Exception: + pass + j += 1 + raise e + + # 4. Return new instance. + instance = cls.__new__(cls) + instance._uri = uri + instance._linstor = lin + instance._logger = logger + instance._redundancy = redundancy + instance._group_name = group_name + instance._volumes = set() + return instance + + @classmethod + def build_device_path(cls, volume_name): + """ + Build a device path given a volume name. + :param str volume_name: The volume name to use. + :return: A valid or not device path. + :rtype: str + """ + + return '{}{}/0'.format(cls.DEV_ROOT_PATH, volume_name) + + @classmethod + def build_volume_name(cls, base_name): + """ + Build a volume name given a base name (i.e. a UUID). + :param str volume_name: The volume name to use. + :return: A valid or not device path. + :rtype: str + """ + return '{}{}'.format(cls.PREFIX_VOLUME, base_name) + + @classmethod + def round_up_volume_size(cls, volume_size): + """ + Align volume size on higher multiple of BLOCK_SIZE. + :param int volume_size: The volume size to align. + :return: An aligned volume size. + :rtype: int + """ + return round_up(volume_size, cls.BLOCK_SIZE) + + @classmethod + def round_down_volume_size(cls, volume_size): + """ + Align volume size on lower multiple of BLOCK_SIZE. + :param int volume_size: The volume size to align. + :return: An aligned volume size. 
+ :rtype: int + """ + return round_down(volume_size, cls.BLOCK_SIZE) + + # -------------------------------------------------------------------------- + # Private helpers. + # -------------------------------------------------------------------------- + + def _ensure_volume_exists(self, volume_uuid): + if volume_uuid not in self._volumes: + raise LinstorVolumeManagerError( + 'volume `{}` doesn\'t exist'.format(volume_uuid), + LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS + ) + + def _find_best_size_candidates(self): + result = self._linstor.resource_group_qmvs(self._group_name) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to get max volume size allowed of SR `{}`: {}'.format( + self._group_name, + error_str + ) + ) + return result[0].candidates + + def _fetch_resource_names(self): + resource_names = set() + dfns = self._linstor.resource_dfn_list_raise().resource_definitions + for dfn in dfns: + if dfn.resource_group_name == self._group_name and \ + linstor.consts.FLAG_DELETE not in dfn.flags: + resource_names.add(dfn.name) + return resource_names + + def _get_volumes_info(self, filter=None): + all_volume_info = {} + resources = self._linstor.resource_list_raise( + filter_by_resources=filter + ) + for resource in resources.resources: + if resource.name not in all_volume_info: + current = all_volume_info[resource.name] = self.VolumeInfo( + resource.name + ) + else: + current = all_volume_info[resource.name] + + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". 
+ if volume.storage_pool_name == self._group_name: + if volume.allocated_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get allocated size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + current.physical_size += volume.allocated_size + + if volume.usable_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + virtual_size = volume.usable_size + + current.virtual_size = current.virtual_size and \ + min(current.virtual_size, virtual_size) or virtual_size + + for current in all_volume_info.values(): + current.physical_size *= 1024 + current.virtual_size *= 1024 + + return all_volume_info + + def _get_volume_node_names_and_size(self, volume_name): + node_names = set() + size = -1 + for resource in self._linstor.resource_list_raise( + filter_by_resources=[volume_name] + ).resources: + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". 
+ if volume.storage_pool_name == self._group_name: + node_names.add(resource.node_name) + + current_size = volume.usable_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + + if size < 0: + size = current_size + else: + size = min(size, current_size) + + return (node_names, size * 1024) + + def _compute_size(self, attr): + pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + + capacity = 0 + for pool in pools: + space = pool.free_space + if space: + size = getattr(space, attr) + if size < 0: + raise LinstorVolumeManagerError( + 'Failed to get pool {} attr of `{}`' + .format(attr, pool.node_name) + ) + capacity += size + return capacity * 1024 + + def _get_node_names(self): + node_names = set() + pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + for pool in pools: + node_names.add(pool.node_name) + return node_names + + def _check_volume_creation_errors(self, result, volume_uuid): + errors = self._filter_errors(result) + if self._check_errors(errors, [ + linstor.consts.FAIL_EXISTS_RSC, linstor.consts.FAIL_EXISTS_RSC_DFN + ]): + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`, it already exists' + .format(volume_uuid, self._group_name), + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + if errors: + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`: {}'.format( + volume_uuid, + self._group_name, + self._get_error_str(errors) + ) + ) + + def _create_volume(self, volume_uuid, volume_name, size, place_resources): + size = self.round_up_volume_size(size) + + self._check_volume_creation_errors(self._linstor.resource_group_spawn( + rsc_grp_name=self._group_name, + rsc_dfn_name=volume_name, + vlm_sizes=['{}B'.format(size)], + definitions_only=not place_resources + ), volume_uuid) + + def 
_create_volume_with_properties( + self, volume_uuid, volume_name, size, place_resources + ): + if self.check_volume_exists(volume_uuid): + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, it already exists' + .format(volume_uuid, self._group_name) + ' in properties', + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + if volume_name in self._fetch_resource_names(): + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, '.format( + volume_uuid, self._group_name + ) + 'resource of the same name already exists in LINSTOR' + ) + + # I am paranoid. + volume_properties = self._get_volume_properties(volume_uuid) + if (volume_properties.get(self.PROP_NOT_EXISTS) is not None): + raise LinstorVolumeManagerError( + 'Could not create volume `{}`, '.format(volume_uuid) + + 'properties already exist' + ) + + try: + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_CREATING + volume_properties[self.PROP_VOLUME_NAME] = volume_name + + self._create_volume( + volume_uuid, volume_name, size, place_resources + ) + + return volume_properties + except LinstorVolumeManagerError as e: + # Do not destroy existing resource! + # In theory we can't get this error because we check this event + # before the `self._create_volume` case. + # It can only happen if the same volume uuid is used in the same + # call in another host. + if e.code == LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + raise + self._force_destroy_volume(volume_uuid, volume_properties) + raise + except Exception: + self._force_destroy_volume(volume_uuid, volume_properties) + raise + + def _find_device_path(self, volume_uuid, volume_name): + current_device_path = self._request_device_path( + volume_uuid, volume_name, activate=True + ) + + # We use realpath here to get the /dev/drbd path instead of + # /dev/drbd/by-res/. 
+ expected_device_path = self.build_device_path(volume_name) + util.wait_for_path(expected_device_path, 5) + + device_realpath = os.path.realpath(expected_device_path) + if current_device_path != device_realpath: + raise LinstorVolumeManagerError( + 'Invalid path, current={}, expected={} (realpath={})' + .format( + current_device_path, + expected_device_path, + device_realpath + ) + ) + return expected_device_path + + def _request_device_path(self, volume_uuid, volume_name, activate=False): + node_name = socket.gethostname() + resources = self._linstor.resource_list( + filter_by_nodes=[node_name], + filter_by_resources=[volume_name] + ) + + if not resources or not resources[0]: + raise LinstorVolumeManagerError( + 'No response list for dev path of `{}`'.format(volume_uuid) + ) + if isinstance(resources[0], linstor.responses.ResourceResponse): + if not resources[0].resources: + if activate: + self._activate_device_path(node_name, volume_name) + return self._request_device_path(volume_uuid, volume_name) + raise LinstorVolumeManagerError( + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(volume_uuid) + ) + # Contains a path of the /dev/drbd form. 
+ return resources[0].resources[0].volumes[0].device_path + + raise LinstorVolumeManagerError( + 'Unable to get volume dev path `{}`: {}'.format( + volume_uuid, str(resources[0]) + ) + ) + + def _activate_device_path(self, node_name, volume_name): + result = self._linstor.resource_create([ + linstor.ResourceData(node_name, volume_name, diskless=True) + ]) + if linstor.Linstor.all_api_responses_no_error(result): + return + errors = linstor.Linstor.filter_api_call_response_errors(result) + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_EXISTS_RSC + ): + return + + raise LinstorVolumeManagerError( + 'Unable to activate device path of `{}` on node `{}`: {}' + .format(volume_name, node_name, ', '.join( + [str(x) for x in result])) + ) + + def _destroy_resource(self, resource_name): + result = self._linstor.resource_dfn_delete(resource_name) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not destroy resource `{}` from SR `{}`: {}' + .format(resource_name, self._group_name, error_str) + ) + + def _destroy_volume(self, volume_uuid, volume_properties): + assert volume_properties.namespace == \ + self._build_volume_namespace(volume_uuid) + + try: + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + if volume_name in self._fetch_resource_names(): + self._destroy_resource(volume_name) + + # Assume this call is atomic. 
+ volume_properties.clear() + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot destroy volume `{}`: {}'.format(volume_uuid, e) + ) + + def _force_destroy_volume(self, volume_uuid, volume_properties): + try: + self._destroy_volume(volume_uuid, volume_properties) + except Exception as e: + self._logger('Ignore fail: {}'.format(e)) + + def _build_volumes(self, repair): + properties = linstor.KV( + self._get_store_name(), + uri=self._uri, + namespace=self._build_volume_namespace() + ) + + resource_names = self._fetch_resource_names() + + self._volumes = set() + + updating_uuid_volumes = self._get_volumes_by_property( + self.REG_UPDATING_UUID_SRC, ignore_inexisting_volumes=False + ) + if updating_uuid_volumes and not repair: + raise LinstorVolumeManagerError( + 'Cannot build LINSTOR volume list: ' + 'It exists invalid "updating uuid volumes", repair is required' + ) + + existing_volumes = self._get_volumes_by_property( + self.REG_NOT_EXISTS, ignore_inexisting_volumes=False + ) + for volume_uuid, not_exists in existing_volumes.items(): + properties.namespace = self._build_volume_namespace( + volume_uuid + ) + + src_uuid = properties.get(self.PROP_UPDATING_UUID_SRC) + if src_uuid: + self._logger( + 'Ignoring volume during manager initialization with prop ' + ' PROP_UPDATING_UUID_SRC: {} (properties={})' + .format( + volume_uuid, + self._get_filtered_properties(properties) + ) + ) + continue + + # Insert volume in list if the volume exists. Or if the volume + # is being created and a slave wants to use it (repair = False). + # + # If we are on the master and if repair is True and state is + # Creating, it's probably a bug or crash: the creation process has + # been stopped. 
+ if not_exists == self.STATE_EXISTS or ( + not repair and not_exists == self.STATE_CREATING + ): + self._volumes.add(volume_uuid) + continue + + if not repair: + self._logger( + 'Ignoring bad volume during manager initialization: {} ' + '(properties={})'.format( + volume_uuid, + self._get_filtered_properties(properties) + ) + ) + continue + + # Remove bad volume. + try: + self._logger( + 'Removing bad volume during manager initialization: {} ' + '(properties={})'.format( + volume_uuid, + self._get_filtered_properties(properties) + ) + ) + volume_name = properties.get(self.PROP_VOLUME_NAME) + + # Little optimization, don't call `self._destroy_volume`, + # we already have resource name list. + if volume_name in resource_names: + self._destroy_resource(volume_name) + + # Assume this call is atomic. + properties.clear() + except Exception as e: + # Do not raise, we don't want to block user action. + self._logger( + 'Cannot clean volume {}: {}'.format(volume_uuid, e) + ) + + for dest_uuid, src_uuid in updating_uuid_volumes.items(): + dest_properties = self._get_volume_properties(dest_uuid) + if int(dest_properties.get(self.PROP_NOT_EXISTS) or + self.STATE_EXISTS): + dest_properties.clear() + continue + + src_properties = self._get_volume_properties(src_uuid) + src_properties.clear() + + dest_properties.pop(self.PROP_UPDATING_UUID_SRC) + + if src_uuid in self._volumes: + self._volumes.remove(src_uuid) + self._volumes.add(dest_uuid) + + def _get_sr_properties(self): + return linstor.KV( + self._get_store_name(), + uri=self._uri, + namespace=self._build_sr_namespace() + ) + + def _get_volumes_by_property( + self, reg_prop, ignore_inexisting_volumes=True + ): + base_properties = linstor.KV( + self._get_store_name(), + uri=self._uri, + namespace=self._build_volume_namespace() + ) + + volume_properties = {} + for volume_uuid in self._volumes: + volume_properties[volume_uuid] = '' + + for key, value in base_properties.items(): + res = reg_prop.match(key) + if res: + 
volume_uuid = res.groups()[0] + if not ignore_inexisting_volumes or \ + volume_uuid in self._volumes: + volume_properties[volume_uuid] = value + + return volume_properties + + def _get_volume_properties(self, volume_uuid): + return linstor.KV( + self._get_store_name(), + uri=self._uri, + namespace=self._build_volume_namespace(volume_uuid) + ) + + def _get_store_name(self): + return 'xcp-sr-{}'.format(self._group_name) + + @classmethod + def _build_sr_namespace(cls): + return '/{}/'.format(cls.NAMESPACE_SR) + + @classmethod + def _build_volume_namespace(cls, volume_uuid=None): + # Return a path to all volumes if `volume_uuid` is not given. + if volume_uuid is None: + return '/{}/'.format(cls.NAMESPACE_VOLUME) + return '/{}/{}/'.format(cls.NAMESPACE_VOLUME, volume_uuid) + + @classmethod + def _get_error_str(cls, result): + return ', '.join([ + err.message for err in cls._filter_errors(result) + ]) + + @classmethod + def _create_linstor_instance(cls, uri): + def connect(): + instance = linstor.Linstor(uri, keep_alive=True) + instance.connect() + return instance + + return util.retry( + connect, + maxretry=60, + exceptions=[linstor.errors.LinstorNetworkError] + ) + + @classmethod + def _destroy_storage_pool(cls, lin, group_name, node_name): + result = lin.storage_pool_delete(node_name, group_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to destroy SP `{}` on node `{}`: {}'.format( + group_name, + node_name, + error_str + ) + ) + + @classmethod + def _destroy_resource_group(cls, lin, group_name): + result = lin.resource_group_delete(group_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to destroy RG `{}`: {}'.format(group_name, error_str) + ) + + @classmethod + def _build_group_name(cls, base_name): + # If thin provisioning is used we have a path like this: + # `VG/LV`. "/" is not accepted by LINSTOR. 
+ return '{}{}'.format(cls.PREFIX_SR, base_name.replace('/', '_')) + + @staticmethod + def _get_filtered_properties(properties): + return dict(properties.items()) + + @staticmethod + def _filter_errors(result): + return [ + err for err in result + if hasattr(err, 'is_error') and err.is_error() + ] + + @staticmethod + def _check_errors(result, codes): + for err in result: + for code in codes: + if err.is_error(code): + return True + return False diff --git a/drivers/tapdisk-pause b/drivers/tapdisk-pause index 59368696..ed6abede 100755 --- a/drivers/tapdisk-pause +++ b/drivers/tapdisk-pause @@ -29,6 +29,12 @@ import lvhdutil import vhdutil import lvmcache +try: + from linstorvolumemanager import LinstorVolumeManager + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + TAPDEV_BACKPATH_PFX = "/dev/sm/backend" TAPDEV_PHYPATH_PFX = "/dev/sm/phy" @@ -73,24 +79,6 @@ def _mkphylink(sr_uuid, vdi_uuid, path): util.pread2(cmd) return path -def _pathRefresh(): - # LVM rename check - realpath = os.path.realpath(self.phypath) - phypath = vdi_type = None - util.SMlog("Realpath: %s" % realpath) - if realpath.startswith("/dev/VG_XenStorage-") and \ - not os.path.exists(realpath): - util.SMlog("Path inconsistent") - pfx = "/dev/VG_XenStorage-%s/" % self.sr_uuid - for ty in ["LV","VHD"]: - p = pfx + ty + "-" + self.vdi_uuid - util.SMlog("Testing path: %s" % p) - if os.path.exists(p): - _mkphylink(self.sr_uuid, self.vdi_uuid, p) - phypath = p - if ty == "LV": vdi_type = "aio" - else: vdi_type = "vhd" - def tapPause(session, args): tap = Tapdisk(session, args) return tap.Pause() @@ -148,7 +136,51 @@ class Tapdisk: self.realpath = p if ty == "LV": self.vdi_type = "aio" else: self.vdi_type = "vhd" - + elif realpath.startswith('/dev/drbd/by-res/xcp-volume-'): + if not LINSTOR_AVAILABLE: + raise util.SMException( + 'Can\'t refresh tapdisk: LINSTOR libraries are missing' + ) + + # We must always recreate the symlink to ensure we have + # the right info. Why? 
Because if the volume UUID is changed in + # LINSTOR the symlink is not directly updated. When live leaf + # coalesce is executed we have these steps: + # "A" -> "OLD_A" + # "B" -> "A" + # Without symlink update the previous "A" path is reused instead of + # "B" path. Note: "A", "B" and "OLD_A" are UUIDs. + session = self.session + + linstor_uri = 'linstor://{}'.format( + util.get_master_rec(session)['address'] + ) + + host_ref = util.get_this_host_ref(session) + sr_ref = session.xenapi.SR.get_by_uuid(self.sr_uuid) + + pbd = util.find_my_pbd(session, host_ref, sr_ref) + if pbd is None: + raise util.SMException('Failed to find PBD') + + dconf = session.xenapi.PBD.get_device_config(pbd) + group_name = dconf['group-name'] + + device_path = LinstorVolumeManager( + linstor_uri, + group_name, + logger=util.SMlog + ).get_device_path(self.vdi_uuid) + + if realpath != device_path: + util.SMlog( + 'Update LINSTOR PhyLink (previous={}, current={})' + .format(realpath, device_path) + ) + os.unlink(self.phypath) + _mkphylink(self.sr_uuid, self.vdi_uuid, device_path) + self.realpath = device_path + @locking("VDIUnavailable") def Pause(self): util.SMlog("Pause for %s" % self.vdi_uuid) diff --git a/drivers/util.py b/drivers/util.py index 2353f4b2..f6739373 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -651,10 +651,36 @@ def get_this_host(): f.close() return uuid -def is_master(session): + +def get_master_ref(session): pools = session.xenapi.pool.get_all() - master = session.xenapi.pool.get_master(pools[0]) - return get_this_host_ref(session) == master + return session.xenapi.pool.get_master(pools[0]) + + +def get_master_rec(session): + return session.xenapi.host.get_record(get_master_ref(session)) + + +def is_master(session): + return get_this_host_ref(session) == get_master_ref(session) + + +def get_master_address(): + address = None + try: + fd = open('/etc/xensource/pool.conf', 'r') + try: + items = fd.readline().split(':') + if items[0].strip() == 'master': + address = 
'localhost' + else: + address = items[1].strip() + finally: + fd.close() + except Exception: + pass + return address + # XXX: this function doesn't do what it claims to do def get_localhost_uuid(session): @@ -1384,13 +1410,21 @@ def findRunningProcessOrOpenFile(name, process = True): else: return (retVal, processandpids) -def retry(f, maxretry=20, period=3): +def retry(f, maxretry=20, period=3, exceptions=[Exception]): retries = 0 while True: try: return f() - except Exception, e: - SMlog("Got exception: %s. Retry number: %s" % (str(e),retries)) + except Exception as e: + for exception in exceptions: + if isinstance(e, exception): + SMlog('Got exception: {}. Retry number: {}'.format( + str(e), retries + )) + break + else: + SMlog('Got bad exception: {}. Raising...'.format(e)) + raise e retries += 1 if retries >= maxretry: diff --git a/linstor/Makefile b/linstor/Makefile new file mode 100644 index 00000000..c329ca30 --- /dev/null +++ b/linstor/Makefile @@ -0,0 +1,22 @@ +PREFIX ?= /opt/xensource/libexec +DESTDIR ?= +DEBUGDIR ?= /opt/xensource/debug + + +OPTS := -Wall -std=gnu99 + +SRC := linstor-monitord.c + +BIN := linstor-monitord + +all: daemon + +daemon: linstor-monitord.c + $(CC) $(OPTS) $(SRC) -o $(BIN) + +install: linstor-monitord + mkdir -p $(DESTDIR)$(PREFIX) + install -m 755 $^ $(DESTDIR)$(PREFIX) + +clean: + rm -f linstor-monitord diff --git a/linstor/linstor-monitord.c b/linstor/linstor-monitord.c new file mode 100644 index 00000000..8161813d --- /dev/null +++ b/linstor/linstor-monitord.c @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2020 Vates SAS - ronan.abhamon@vates.fr + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <ctype.h> +#include <errno.h> +#include <inttypes.h> +#include <poll.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/inotify.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <syslog.h> +#include <time.h> +#include <unistd.h> + +// TODO: Handle new hosts. +// TODO: https://github.com/xcp-ng/xcp/issues/421 + +// ============================================================================= + +#define POOL_CONF_DIR "/etc/xensource" +#define POOL_CONF_FILE "pool.conf" +#define POOL_CONF_ABS_FILE POOL_CONF_DIR "/" POOL_CONF_FILE + +// In milliseconds. +#define POLL_TIMEOUT 2000 + +// ----------------------------------------------------------------------------- + +static inline void normalizeTime (struct timespec *spec) { + while (spec->tv_nsec >= 1000000000) { + ++spec->tv_sec; + spec->tv_nsec -= 1000000000; + } + while (spec->tv_nsec < 0) { + --spec->tv_sec; + spec->tv_nsec += 1000000000; + } +} + +static inline struct timespec getCurrentTime () { + struct timespec spec; + clock_gettime(CLOCK_MONOTONIC, &spec); + return (struct timespec){ + .tv_sec = spec.tv_sec, + .tv_nsec = spec.tv_nsec + }; +} + +static inline struct timespec getTimeDiff (const struct timespec *a, const struct timespec *b) { + struct timespec result = *a; + result.tv_sec -= b->tv_sec - 1; + result.tv_nsec -= b->tv_nsec + 1000000000; + normalizeTime(&result); + return result; +} + +static inline int64_t convertToMilliseconds (struct timespec spec) { + spec.tv_nsec += 1000 - spec.tv_nsec % 1000; + normalizeTime(&spec); + return spec.tv_sec * 1000 + spec.tv_nsec / 1000000; +} + +// ----------------------------------------------------------------------------- + +static inline int readPoolConf (char *buffer, size_t 
bufferSize) { + FILE *f = fopen(POOL_CONF_ABS_FILE, "r"); + if (!f) { + syslog(LOG_ERR, "Failed to open `" POOL_CONF_ABS_FILE "`: `%s`.", strerror(errno)); + return -errno; + } + + int ret = 0; + if (!fgets(buffer, bufferSize, f)) { + syslog(LOG_ERR, "Cannot read `" POOL_CONF_ABS_FILE "`."); + ret = -EIO; + } + + fclose(f); + + return ret; +} + +static inline int isMasterHost (int *error) { + if (error) + *error = 0; + + char buffer[512]; + + int ret = readPoolConf(buffer, sizeof buffer); + if (ret < 0) { + if (error) + *error = ret; + return 0; + } + + static const char masterStr[] = "master"; + static const size_t masterLen = sizeof masterStr - 1; + if (!strncmp(buffer, masterStr, masterLen)) { + const char end = buffer[masterLen]; + ret = end == '\0' || isspace(end); + } + + if (ret < 0) { + if (error) + *error = ret; + return 0; + } + + return ret; +} + +// ----------------------------------------------------------------------------- + +typedef struct { + int inotifyFd; + // TODO: Should be completed with at least a hostname field. +} State; + +// ----------------------------------------------------------------------------- + +static inline int execCommand (char *argv[]) { + const pid_t pid = fork(); + if (pid < 0) + return -errno; + + // Child process. + if (pid == 0) { + if (execvp(*argv, argv) < 0) + syslog(LOG_ERR, "Failed to exec `%s` command.", *argv); + exit(EXIT_FAILURE); + } + + // Main process. 
+ int status; + if (waitpid(pid, &status, 0) < 0) { + syslog(LOG_ERR, "Failed to wait command: `%s`.", *argv); + return -errno; + } + + if (WIFEXITED(status)) { + const int code = WEXITSTATUS(status); + if (code == 0) + syslog(LOG_INFO, "`%s` completed normally.", *argv); + else + syslog(LOG_ERR, "`%s` exited with an error: %d.", *argv, code); + } else if (WIFSIGNALED(status)) + syslog(LOG_ERR, "`%s` terminated by signal %d.", *argv, WTERMSIG(status)); + + return 0; +} + +// ----------------------------------------------------------------------------- + +static inline int createInotifyInstance () { + const int fd = inotify_init1(IN_CLOEXEC); + if (fd < 0) { + syslog(LOG_ERR, "Unable to create inotify instance: `%s`.", strerror(errno)); + return -errno; + } + return fd; +} + +static inline int addInotifyWatch (int inotifyFd, const char *filepath, uint32_t mask) { + const int wd = inotify_add_watch(inotifyFd, filepath, mask); + if (wd < 0) { + syslog(LOG_ERR, "Unable to register `%s`: `%s`.", filepath, strerror(errno)); + return -errno; + } + return wd; +} + +// ----------------------------------------------------------------------------- + +static inline int updateLinstorServices () { + int error; + const int isMaster = isMasterHost(&error); + if (error) + return error; + + syslog(LOG_INFO, "%s linstor-controller...", isMaster ? "Enabling" : "Disabling"); + char *argv[] = { + "systemctl", + isMaster ? "enable" : "disable", + "--now", + "linstor-controller", + NULL + }; + return execCommand(argv); +} + +static inline int updateLinstorNode (State *state) { + char buffer[256]; + if (gethostname(buffer, sizeof buffer) == -1) { + syslog(LOG_ERR, "Failed to get hostname: `%s`.", strerror(errno)); + return errno ? 
-errno : -EINVAL; + } + + // TODO: Finish me, see: https://github.com/xcp-ng/xcp/issues/421 + + return 0; +} + +// ----------------------------------------------------------------------------- + +#define PROCESS_MODE_DEFAULT 0 +#define PROCESS_MODE_WAIT_FILE_CREATION 1 + +static inline int waitForPoolConfCreation (State *state, int *wdFile); + +static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, size_t *bufferSize, int mode, int *process) { + size_t size = 0; + if (ioctl(inotifyFd, FIONREAD, (char *)&size) == -1) { + syslog(LOG_ERR, "Failed to get buffer size from inotify descriptor: `%s`.", strerror(errno)); + return -errno; + } + + if (*bufferSize < size) { + void *ptr = realloc(*buffer, size); + if (!ptr) { + syslog(LOG_ERR, "Failed to reallocate buffer with size %zu: `%s`.", size, strerror(errno)); + return -errno; + } + *buffer = ptr; + *bufferSize = size; + } + + if ((size = (size_t)read(inotifyFd, *buffer, size)) == (size_t)-1) { + syslog(LOG_ERR, "Failed to read buffer from inotify descriptor: `%s`.", strerror(errno)); + return -errno; + } + + uint32_t mask = 0; + for (char *p = *buffer, *end = p + size; p < end; ) { + const struct inotify_event *event = (struct inotify_event *)p; + + if (event->mask & IN_Q_OVERFLOW) + syslog(LOG_WARNING, "Event queue overflow."); + + if (event->wd == wd) { + if (event->len) { + // Event in the watched directory. + if (!strncmp(event->name, POOL_CONF_FILE, event->len)) + mask |= event->mask; + } else { + // Directory or watched file event. + if (mode == PROCESS_MODE_DEFAULT) + mask |= event->mask; + else if (event->mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT)) { + syslog(LOG_ERR, "Watched `" POOL_CONF_DIR "` dir has been removed!"); + return -EIO; // The process should be exited after that. + } + } + } + + p += sizeof(struct inotify_event) + event->len; + } + + int ret = 0; + if (mode == PROCESS_MODE_DEFAULT) { + if (!mask) + return 0; + + syslog(LOG_INFO, "Updating linstor services... 
(Inotify mask=%" PRIu32 ")", mask); + if (mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT)) { + syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been removed!"); + inotify_rm_watch(inotifyFd, wd); // Do not forget to remove watch to avoid leaks. + return -EIO; + } + ret = updateLinstorServices(); + } else { + if (mask & (IN_CREATE | IN_MOVED_TO)) { + syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been recreated!"); + *process = 0; + } + } + + return ret; +} + +static inline int waitAndProcessEvents (State *state, int wd, int mode) { + char *buffer = NULL; + size_t bufferSize = 0; + + int ret = 0; + int process = 1; + + struct timespec previousTime = getCurrentTime(); + do { + struct timespec currentTime = getCurrentTime(); + const int64_t elapsedTime = convertToMilliseconds(getTimeDiff(&currentTime, &previousTime)); + + int timeout; + if (elapsedTime >= POLL_TIMEOUT) { + updateLinstorNode(state); + timeout = POLL_TIMEOUT; + previousTime = getCurrentTime(); + } else { + timeout = POLL_TIMEOUT - elapsedTime; + } + + struct pollfd fds = { state->inotifyFd, POLLIN, 0 }; + const int res = poll(&fds, 1, timeout); + if (res < 0) { + if (errno == EAGAIN) + continue; + syslog(LOG_ERR, "Failed to poll from inotify descriptor: `%s`.", strerror(errno)); + ret = -errno; + } else if (res > 0) { + ret = processPoolConfEvents(state->inotifyFd, wd, &buffer, &bufferSize, mode, &process); + } + } while (ret >= 0 && process); + + free(buffer); + return ret; +} + +static inline int waitAndProcessFileEvents (State *state, int wd) { + return waitAndProcessEvents(state, wd, PROCESS_MODE_DEFAULT); +} + +static inline int waitAndProcessDirEvents (State *state, int wd) { + return waitAndProcessEvents(state, wd, PROCESS_MODE_WAIT_FILE_CREATION); +} + +static inline int waitForPoolConfCreation (State *state, int *wdFile) { + const int wdDir = addInotifyWatch( + state->inotifyFd, POOL_CONF_DIR, IN_MOVED_TO | IN_CREATE | IN_MOVE_SELF | IN_DELETE_SELF + ); + if (wdDir < 0) + 
return wdDir; + + int ret = 0; + do { + do { + // Update LINSTOR services... + ret = updateLinstorServices(); + + // Ok we can't read the pool configuration file. + // Maybe the file doesn't exist. Waiting its creation... + } while ((ret == -ENOENT || ret == -EIO) && !(ret = waitAndProcessDirEvents(state, wdDir))); + + // The services have been updated, now we must add a new watch on the pool config file directly. + if (!ret) { + *wdFile = addInotifyWatch(state->inotifyFd, POOL_CONF_ABS_FILE, IN_MODIFY | IN_MOVE_SELF | IN_DELETE_SELF); + if (*wdFile < 0) + ret = *wdFile; + } + } while (ret == -ENOENT); + + inotify_rm_watch(state->inotifyFd, wdDir); + return ret; +} + +// ----------------------------------------------------------------------------- + +int main (int argc, char *argv[]) { + (void)argc; + (void)argv; + + openlog(argv[0], LOG_PID, LOG_USER | LOG_MAIL); + setlogmask(LOG_UPTO(LOG_INFO)); + + State state = { + .inotifyFd = -1 + }; + + const int inotifyFd = createInotifyInstance(); + if (inotifyFd < 0) + return -inotifyFd; + state.inotifyFd = inotifyFd; + + updateLinstorNode(&state); + + int ret = 0; + while (!ret || ret == -ENOENT || ret == -EIO) { + int wdFile; + if ((ret = waitForPoolConfCreation(&state, &wdFile)) < 0) + break; // If the pool config dir cannot be watched or accessed, we consider it is a fatal error. 
+ + ret = waitAndProcessFileEvents(&state, wdFile); + } + + close(inotifyFd); + return -ret; +} diff --git a/systemd/linstor-monitor.service b/systemd/linstor-monitor.service new file mode 100644 index 00000000..5f8f0a76 --- /dev/null +++ b/systemd/linstor-monitor.service @@ -0,0 +1,13 @@ +[Unit] +Description=LINSTOR Monitor +Before=xs-sm.service +ConditionPathExists=/usr/share/linstor-server/bin/Controller + +[Service] +StandardOutput=null +StandardError=journal +ExecStart=/opt/xensource/libexec/linstor-monitord +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/tests/mocks/linstor/__init__.py b/tests/mocks/linstor/__init__.py new file mode 100644 index 00000000..e69de29b From d64ac066354c0bafae04385a7a8ef98044e82032 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 27 Oct 2020 15:04:36 +0100 Subject: [PATCH 008/133] feat(tests): add unit tests concerning ZFS (close xcp-ng/xcp#425) - Check if "create" doesn't succeed without zfs packages - Check if "scan" failed if the path is not mounted (not a ZFS mountpoint) --- drivers/ZFSSR.py | 32 ++++++------ tests/test_ZFSSR.py | 115 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 tests/test_ZFSSR.py diff --git a/drivers/ZFSSR.py b/drivers/ZFSSR.py index 1b2f398f..d3752101 100644 --- a/drivers/ZFSSR.py +++ b/drivers/ZFSSR.py @@ -58,6 +58,18 @@ } +def is_zfs_available(): + import distutils.spawn + return distutils.spawn.find_executable('zfs') and \ + util.pathexists('/sys/module/zfs/initstate') + + +def is_zfs_path(path): + cmd = ['findmnt', '-o', 'FSTYPE', '-n', path] + fs_type = util.pread2(cmd).split('\n')[0] + return fs_type == 'zfs' + + class ZFSSR(FileSR.FileSR): DRIVER_TYPE = 'zfs' @@ -66,7 +78,7 @@ def handles(type): return type == ZFSSR.DRIVER_TYPE def load(self, sr_uuid): - if not self._is_zfs_available(): + if not is_zfs_available(): raise xs_errors.XenError( 'SRUnavailable', opterr='zfs is not installed or module is not 
loaded' @@ -74,7 +86,7 @@ def load(self, sr_uuid): return super(ZFSSR, self).load(sr_uuid) def create(self, sr_uuid, size): - if not self._is_zfs_path(self.remotepath): + if not is_zfs_path(self.remotepath): raise xs_errors.XenError( 'ZFSSRCreate', opterr='Cannot create SR, path is not a ZFS mountpoint' @@ -90,7 +102,7 @@ def delete(self, sr_uuid): return super(ZFSSR, self).delete(sr_uuid) def attach(self, sr_uuid): - if not self._is_zfs_path(self.remotepath): + if not is_zfs_path(self.remotepath): raise xs_errors.XenError( 'SRUnavailable', opterr='Invalid ZFS path' @@ -106,19 +118,7 @@ def vdi(self, uuid, loadLocked=False): # Ensure _checkmount is overridden to prevent bad behaviors in FileSR. def _checkmount(self): return super(ZFSSR, self)._checkmount() and \ - self._is_zfs_path(self.remotepath) - - @staticmethod - def _is_zfs_path(path): - cmd = ['findmnt', '-o', 'FSTYPE', '-n', path] - fs_type = util.pread2(cmd).split('\n')[0] - return fs_type == 'zfs' - - @staticmethod - def _is_zfs_available(): - import distutils.spawn - return distutils.spawn.find_executable('zfs') and \ - util.pathexists('/sys/module/zfs/initstate') + is_zfs_path(self.remotepath) class ZFSFileVDI(FileSR.FileVDI): diff --git a/tests/test_ZFSSR.py b/tests/test_ZFSSR.py new file mode 100644 index 00000000..6f8040dc --- /dev/null +++ b/tests/test_ZFSSR.py @@ -0,0 +1,115 @@ +import FileSR +import mock +import os +import SR +import unittest +import ZFSSR + + +XML_DEFS = os.path.dirname(os.path.abspath(__file__)) + \ + '/../drivers/XE_SR_ERRORCODES.xml' + + +class FakeZFSSR(ZFSSR.ZFSSR): + uuid = None + sr_ref = None + session = None + srcmd = None + other_config = {} + vdis = {} + passthrough = True + + def __init__(self, srcmd, none): + self.dconf = srcmd.dconf + self.srcmd = srcmd + + +class TestZFSSR(unittest.TestCase): + def create_zfs_sr(self, sr_uuid='asr_uuid', location='fake_path'): + srcmd = mock.Mock() + srcmd.dconf = { + 'location': location + } + srcmd.params = { + 'command': 
'some_command', + 'device_config': {} + } + sr = FakeZFSSR(srcmd, None) + sr.load(sr_uuid) + return sr + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + def test_load(self, lock, is_zfs_available): + self.create_zfs_sr() + + @mock.patch('xs_errors.XML_DEFS', new=XML_DEFS) + def test_load_with_zfs_unavailable(self): + failed = False + try: + self.create_zfs_sr() + except SR.SROSError as e: + # Check SRUnavailable error. + self.assertTrue(e.errno == 47) + failed = True + self.assertTrue(failed) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + def test_create(self, lock, is_zfs_path, is_zfs_available): + sr = self.create_zfs_sr() + sr.create(sr.uuid, 42) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + @mock.patch('xs_errors.XML_DEFS', new=XML_DEFS) + def test_create_with_invalid_zfs_path( + self, lock, is_zfs_path, is_zfs_available + ): + failed = False + + is_zfs_path.return_value = False + sr = self.create_zfs_sr() + try: + sr.create(sr.uuid, 42) + except SR.SROSError as e: + # Check ZFSSRCreate error. 
+ self.assertTrue(e.errno == 5000) + failed = True + self.assertTrue(failed) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + @mock.patch('FileSR.FileSR._checkmount', autospec=True) + @mock.patch('FileSR.FileSR._loadvdis', autospec=True) + @mock.patch('SR.SR.scan', autospec=True) + @mock.patch('os.path.ismount', autospec=True) + def test_scan( + self, ismount, scan, _loadvdis, _checkmount, lock, + is_zfs_path, is_zfs_available + ): + sr = self.create_zfs_sr() + sr.scan(sr.uuid) + + @mock.patch('ZFSSR.is_zfs_available', autospec=True) + @mock.patch('ZFSSR.is_zfs_path', autospec=True) + @mock.patch('FileSR.Lock', autospec=True) + @mock.patch('FileSR.FileSR._checkmount', autospec=True) + @mock.patch('xs_errors.XML_DEFS', new=XML_DEFS) + def test_scan_with_invalid_zfs_path( + self, _checkmount, lock, is_zfs_path, is_zfs_available + ): + failed = False + + is_zfs_path.return_value = False + sr = self.create_zfs_sr() + try: + sr.scan(sr.uuid) + except SR.SROSError as e: + # Check SRUnavailable error. + self.assertTrue(e.errno == 47) + failed = True + self.assertTrue(failed) From 68e67e63a4f20f84a7482d0fea7d06bdfaa003b9 Mon Sep 17 00:00:00 2001 From: BenjiReis Date: Thu, 25 Feb 2021 09:54:52 +0100 Subject: [PATCH 009/133] If no NFS ACLs provided, assume everyone: Some QNAP devices do not provide ACL when fetching NFS mounts. In this case the assumed ACL should be: "*". This commit fixes the crash when attempting to access the non existing ACL. 
Relevant issues: - https://github.com/xapi-project/sm/issues/511 - https://github.com/xcp-ng/xcp/issues/113 --- drivers/nfs.py | 6 +++++- tests/test_nfs.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/nfs.py b/drivers/nfs.py index a40b8eda..c06cfc4c 100644 --- a/drivers/nfs.py +++ b/drivers/nfs.py @@ -202,7 +202,11 @@ def scan_exports(target): textnode = dom.createTextNode(target) subentry.appendChild(textnode) - (path, access) = val.split() + # Access is not always provided by showmount return + # If none is provided we need to assume "*" + array = val.split() + path = array[0] + access = array[1] if len(array) >= 2 else "*" subentry = dom.createElement("Path") entry.appendChild(subentry) textnode = dom.createTextNode(path) diff --git a/tests/test_nfs.py b/tests/test_nfs.py index 71800ab0..cef414fe 100644 --- a/tests/test_nfs.py +++ b/tests/test_nfs.py @@ -140,3 +140,33 @@ def test_validate_nfsversion_valid(self): for thenfsversion in ['3', '4', '4.1']: self.assertEquals(nfs.validate_nfsversion(thenfsversion), thenfsversion) + + # Can't use autospec due to http://bugs.python.org/issue17826 + @mock.patch('util.pread2') + def test_scan_exports(self, pread2): + pread2.side_effect = ["/srv/nfs\n/srv/nfs2 *\n/srv/nfs3 127.0.0.1/24"] + res = nfs.scan_exports('aServer') + + expected = """<?xml version="1.0" ?> +<nfs-exports> +\t<Export> +\t\t<Target>aServer</Target> +\t\t<Path>/srv/nfs</Path> +\t\t<Accesslist>*</Accesslist> +\t</Export> +\t<Export> +\t\t<Target>aServer</Target> +\t\t<Path>/srv/nfs2</Path> +\t\t<Accesslist>*</Accesslist> +\t</Export> +\t<Export> +\t\t<Target>aServer</Target> +\t\t<Path>/srv/nfs3</Path> +\t\t<Accesslist>127.0.0.1/24</Accesslist> +\t</Export> +</nfs-exports> +""" + + self.assertEqual(res.toprettyxml(), expected) + self.assertEqual(len(pread2.mock_calls), 1) + pread2.assert_called_with(['/usr/sbin/showmount', '--no-headers', '-e', 'aServer']) From 769a4b9d253cf14ed6f37ab06c77dcef5c22fd50 Mon Sep 17 00:00:00 2001 From: Aleksander Wieliczko Date: Fri, 29 Jan 2021 15:21:23 +0100 Subject: [PATCH 010/133] Added SM Driver for MooseFS Co-authored-by: Piotr Robert Konopelko Signed-off-by: Aleksander Wieliczko Signed-off-by: Ronan Abhamon --- 
Makefile | 1 + drivers/MooseFSSR.py | 271 ++++++++++++++++++++++++++++++++++++++++ drivers/cleanup.py | 2 +- tests/test_MooseFSSR.py | 62 +++++++++ 4 files changed, 335 insertions(+), 1 deletion(-) create mode 100755 drivers/MooseFSSR.py create mode 100644 tests/test_MooseFSSR.py diff --git a/Makefile b/Makefile index 609e1828..b0ae3536 100755 --- a/Makefile +++ b/Makefile @@ -23,6 +23,7 @@ SM_DRIVERS += GlusterFS SM_DRIVERS += XFS SM_DRIVERS += ZFS SM_DRIVERS += EXT4 +SM_DRIVERS += MooseFS SM_LIBS := SR SM_LIBS += SRCommand diff --git a/drivers/MooseFSSR.py b/drivers/MooseFSSR.py new file mode 100755 index 00000000..be5112c8 --- /dev/null +++ b/drivers/MooseFSSR.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python +# +# Original work copyright (C) Citrix systems +# Modified work copyright (C) Tappest sp. z o.o., Vates SAS and XCP-ng community +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; version 2.1 only. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# MooseFSSR: Based on CEPHFSSR and FileSR, mounts MooseFS share + +import errno +import os +import syslog as _syslog +import xmlrpclib +from syslog import syslog + +# careful with the import order here +# FileSR has a circular dependency: +# FileSR -> blktap2 -> lvutil -> EXTSR -> FileSR +# importing in this order seems to avoid triggering the issue. 
+import SR +import SRCommand +import FileSR +# end of careful +import cleanup +import util +import vhdutil +import xs_errors +from lock import Lock + +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "SR_CACHING", + "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", + "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", + "VDI_GENERATE_CONFIG", + "VDI_RESET_ON_BOOT/2", "ATOMIC_PAUSE"] + +CONFIGURATION = [ + ['masterhost', 'MooseFS Master Server hostname or IP address (required, e.g.: "mfsmaster.local.lan" or "10.10.10.1")'], + ['masterport', 'MooseFS Master Server port, default: 9421'], + ['rootpath', 'MooseFS path (required, e.g.: "/")'], + ['options', 'MooseFS Client additional options (e.g.: "mfspassword=PASSWORD,mfstimeout=300")'] +] + +DRIVER_INFO = { + 'name': 'MooseFS VHD', + 'description': 'SR plugin which stores disks as VHD files on a MooseFS storage', + 'vendor': 'Tappest sp. z o.o.', + 'copyright': '(C) 2021 Tappest sp. z o.o.', + 'driver_version': '1.0', + 'required_api_version': '1.0', + 'capabilities': CAPABILITIES, + 'configuration': CONFIGURATION +} + +DRIVER_CONFIG = {"ATTACH_FROM_CONFIG_WITH_TAPDISK": True} + +# The mountpoint for the directory when performing an sr_probe. All probes +# are guaranteed to be serialised by xapi, so this single mountpoint is fine. +PROBE_MOUNTPOINT = os.path.join(SR.MOUNT_BASE, "probe") + + +class MooseFSException(Exception): + def __init__(self, errstr): + self.errstr = errstr + + +class MooseFSSR(FileSR.FileSR): + """MooseFS file-based storage""" + + DRIVER_TYPE = 'moosefs' + + def handles(sr_type): + # fudge, because the parent class (FileSR) checks for smb to alter its behavior + return sr_type == MooseFSSR.DRIVER_TYPE or sr_type == 'smb' + + handles = staticmethod(handles) + + def load(self, sr_uuid): + if not self._is_moosefs_available(): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='MooseFS Client is not installed!' 
+ ) + + self.ops_exclusive = FileSR.OPS_EXCLUSIVE + self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) + self.sr_vditype = SR.DEFAULT_TAP + self.driver_config = DRIVER_CONFIG + if 'masterhost' not in self.dconf: + raise xs_errors.XenError('ConfigServerMissing') + self.remoteserver = self.dconf['masterhost'] + self.remotepath = self.dconf['rootpath'] + # if masterport is not specified, use default: 9421 + if 'masterport' not in self.dconf: + self.remoteport = "9421" + else: + self.remoteport = self.dconf['masterport'] + if self.sr_ref and self.session is not None: + self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) + else: + self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + self.attached = False + self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) + self.mountpoint = self.path + self.linkpath = self.path + self._check_o_direct() + + def checkmount(self): + return util.ioretry(lambda: ((util.pathexists(self.mountpoint) and + util.ismount(self.mountpoint)))) + + def mount(self, mountpoint=None): + """Mount MooseFS share at 'mountpoint'""" + if mountpoint is None: + mountpoint = self.mountpoint + elif not util.is_string(mountpoint) or mountpoint == "": + raise MooseFSException("Mountpoint is not a string object") + + try: + if not util.ioretry(lambda: util.isdir(mountpoint)): + util.ioretry(lambda: util.makedirs(mountpoint)) + except util.CommandException, inst: + raise MooseFSException("Failed to make directory: code is %d" % inst.code) + + try: + options = [] + if self.dconf.has_key('options'): + options.append(self.dconf['options']) + if options: + options = ['-o', ','.join(options)] + command = ["mount", '-t', 'moosefs', self.remoteserver+":"+self.remoteport+":"+self.remotepath, mountpoint] + options + util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) + except util.CommandException, inst: + syslog(_syslog.LOG_ERR, 'MooseFS mount failed ' + inst.__str__()) + raise MooseFSException("Mount 
failed with return code %d" % inst.code) + + # Sanity check to ensure that the user has at least RO access to the + # mounted share. Windows sharing and security settings can be tricky. + try: + util.listdir(mountpoint) + except util.CommandException: + try: + self.unmount(mountpoint, True) + except MooseFSException: + util.logException('MooseFSSR.unmount()') + raise MooseFSException("Permission denied. Please check user privileges.") + + def unmount(self, mountpoint, rmmountpoint): + try: + util.pread(["umount", mountpoint]) + except util.CommandException, inst: + raise MooseFSException("Command umount failed with return code %d" % inst.code) + if rmmountpoint: + try: + os.rmdir(mountpoint) + except OSError, inst: + raise MooseFSException("Command rmdir failed with error '%s'" % inst.strerror) + + def attach(self, sr_uuid): + if not self.checkmount(): + try: + self.mount() + except MooseFSException, exc: + raise SR.SROSError(12, exc.errstr) + self.attached = True + + def probe(self): + try: + self.mount(PROBE_MOUNTPOINT) + sr_list = filter(util.match_uuid, util.listdir(PROBE_MOUNTPOINT)) + self.unmount(PROBE_MOUNTPOINT, True) + except (util.CommandException, xs_errors.XenError): + raise + # Create a dictionary from the SR uuids to feed SRtoXML() + sr_dict = {sr_uuid: {} for sr_uuid in sr_list} + return util.SRtoXML(sr_dict) + + def detach(self, sr_uuid): + if not self.checkmount(): + return + util.SMlog("Aborting GC/coalesce") + cleanup.abort(sr_uuid) + # Change directory to avoid unmount conflicts + os.chdir(SR.MOUNT_BASE) + self.unmount(self.mountpoint, True) + self.attached = False + + def create(self, sr_uuid, size): + if self.checkmount(): + raise SR.SROSError(113, 'MooseFS mount point already attached') + + try: + self.mount() + except MooseFSException, exc: + # noinspection PyBroadException + try: + os.rmdir(self.mountpoint) + except: + # we have no recovery strategy + pass + raise SR.SROSError(111, "MooseFS mount error [opterr=%s]" % exc.errstr) + + + def 
delete(self, sr_uuid): + # try to remove/delete non VDI contents first + super(MooseFSSR, self).delete(sr_uuid) + try: + if self.checkmount(): + self.detach(sr_uuid) + if util.ioretry(lambda: util.pathexists(self.mountpoint)): + util.ioretry(lambda: os.rmdir(self.mountpoint)) + except util.CommandException, inst: + self.detach(sr_uuid) + if inst.code != errno.ENOENT: + raise SR.SROSError(114, "Failed to remove MooseFS mount point") + + def vdi(self, uuid, loadLocked=False): + return MooseFSFileVDI(self, uuid) + + @staticmethod + def _is_moosefs_available(): + import distutils.spawn + return distutils.spawn.find_executable('mfsmount') + +class MooseFSFileVDI(FileSR.FileVDI): + def attach(self, sr_uuid, vdi_uuid): + if not hasattr(self, 'xenstore_data'): + self.xenstore_data = {} + + self.xenstore_data['storage-type'] = MooseFSSR.DRIVER_TYPE + + return super(MooseFSFileVDI, self).attach(sr_uuid, vdi_uuid) + + def generate_config(self, sr_uuid, vdi_uuid): + util.SMlog("MooseFSFileVDI.generate_config") + if not util.pathexists(self.path): + raise xs_errors.XenError('VDIUnavailable') + resp = {'device_config': self.sr.dconf, + 'sr_uuid': sr_uuid, + 'vdi_uuid': vdi_uuid, + 'sr_sm_config': self.sr.sm_config, + 'command': 'vdi_attach_from_config'} + # Return the 'config' encoded within a normal XMLRPC response so that + # we can use the regular response/error parsing code. 
+ config = xmlrpclib.dumps(tuple([resp]), "vdi_attach_from_config") + return xmlrpclib.dumps((config,), "", True) + + def attach_from_config(self, sr_uuid, vdi_uuid): + try: + if not util.pathexists(self.sr.path): + self.sr.attach(sr_uuid) + except: + util.logException("MooseFSFileVDI.attach_from_config") + raise xs_errors.XenError('SRUnavailable', + opterr='Unable to attach from config') + + +if __name__ == '__main__': + SRCommand.run(MooseFSSR, DRIVER_INFO) +else: + SR.registerSR(MooseFSSR) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index a60c9877..b027d882 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -3123,7 +3123,7 @@ def normalizeType(type): type = SR.TYPE_LVHD if type in [ "ext", "nfs", "ocfsoiscsi", "ocfsohba", "smb", "cephfs", "glusterfs", - "xfs", "zfs", "ext4" + "moosefs", "xfs", "zfs", "ext4" ]: type = SR.TYPE_FILE if type in ["linstor"]: diff --git a/tests/test_MooseFSSR.py b/tests/test_MooseFSSR.py new file mode 100644 index 00000000..5a61cf5e --- /dev/null +++ b/tests/test_MooseFSSR.py @@ -0,0 +1,62 @@ +import mock +import MooseFSSR +import unittest + + +class FakeMooseFSSR(MooseFSSR.MooseFSSR): + uuid = None + sr_ref = None + srcmd = None + other_config = {} + + def __init__(self, srcmd, none): + self.dconf = srcmd.dconf + self.srcmd = srcmd + + +class TestMooseFSSR(unittest.TestCase): + + def create_moosefssr(self, masterhost='aServer', rootpath='/aServerpath', + sr_uuid='asr_uuid'): + srcmd = mock.Mock() + srcmd.dconf = { + 'masterhost': masterhost, + 'rootpath': rootpath + } + srcmd.params = { + 'command': 'some_command', + 'device_config': {} + } + moosefssr = FakeMooseFSSR(srcmd, None) + moosefssr.load(sr_uuid) + return moosefssr + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_load(self, Lock): + self.create_moosefssr() + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', 
mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.MooseFSSR.checkmount', autospec=True) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_attach_if_mounted_then_attached(self, mock_lock, mock_checkmount): + mfssr = self.create_moosefssr() + mock_checkmount.return_value=True + mfssr.attach('asr_uuid') + self.assertTrue(mfssr.attached) + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_mount_mountpoint_empty_string(self, mock_lock): + mfssr = self.create_moosefssr() + self.assertRaises(MooseFSSR.MooseFSException, mfssr.mount) + + @mock.patch('MooseFSSR.MooseFSSR._is_moosefs_available', mock.MagicMock(return_value="mfsmount")) + @mock.patch('MooseFSSR.MooseFSSR.checkmount',return_value=False, autospec=True) + @mock.patch('MooseFSSR.Lock', autospec=True) + def test_detach_not_detached_if_not_mounted(self, mock_lock, mock_checkmount): + mfssr = self.create_moosefssr() + mfssr.attached = True + mock_checkmount.return_value=False + mfssr.detach('asr_uuid') + self.assertTrue(mfssr.attached) From b53271b52d8b940ed6cfdb341431d76947ade1ea Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 2 Dec 2021 09:28:37 +0100 Subject: [PATCH 011/133] Avoid usage of `umount` in `ISOSR` when `legacy_mode` is used `umount` should not be called when `legacy_mode` is enabled, otherwise a mounted dir used during SR creation is unmounted at the end of the `create` call (and also when a PBD is unplugged) in `detach` block. Signed-off-by: Ronan Abhamon --- drivers/ISOSR.py | 7 +++--- tests/test_ISOSR.py | 59 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/drivers/ISOSR.py b/drivers/ISOSR.py index 5b126837..35085487 100755 --- a/drivers/ISOSR.py +++ b/drivers/ISOSR.py @@ -455,10 +455,9 @@ def getCacheOptions(self): def detach(self, sr_uuid): """Std. 
detach""" - # This handles legacy mode too, so no need to check - if not self._checkmount(): - return - + if 'legacy_mode' in self.dconf or not self._checkmount(): + return + try: util.pread(["umount", self.mountpoint]); except util.CommandException, inst: diff --git a/tests/test_ISOSR.py b/tests/test_ISOSR.py index 914b9616..a5b32903 100644 --- a/tests/test_ISOSR.py +++ b/tests/test_ISOSR.py @@ -19,6 +19,65 @@ def __init__(self, srcmd, none): self.dconf = srcmd.dconf self.srcmd = srcmd +class TestISOSR_overLocal(unittest.TestCase): + def create_isosr(self, location='/local_sr', sr_uuid='asr_uuid'): + srcmd = mock.Mock() + srcmd.dconf = { + 'location': location, + 'type': 'iso', + 'legacy_mode': True + } + srcmd.params = { + 'command': 'some_command' + } + isosr = FakeISOSR(srcmd, None) + isosr.load(sr_uuid) + return isosr + + @mock.patch('util.pread') + def test_load(self, pread): + self.create_isosr() + # Check `mount/umount` is never called. + self.assertFalse(pread.called) + + @mock.patch('os.path.exists', autospec=True) + @mock.patch('util.pread') + def test_attach_and_detach_local(self, pread, exists): + isosr = self.create_isosr() + isosr.attach(None) + self.assertFalse(pread.called) + isosr.detach(None) + self.assertFalse(pread.called) + + @mock.patch('os.path.exists', autospec=True) + @mock.patch('util.pread') + @mock.patch('ISOSR.ISOSR._checkmount') + def test_attach_and_detach_local_with_mounted_path( + self, _checkmount, pread, exists + ): + _checkmount.return_value = True + + isosr = self.create_isosr() + isosr.attach(None) + self.assertFalse(pread.called) + isosr.detach(None) + self.assertFalse(pread.called) + + @testlib.with_context + @mock.patch('os.path.exists') + @mock.patch('util.pread') + def test_attach_local_with_bad_path(self, context, pread, exists): + context.setup_error_codes() + + # Local path doesn't exist, but error list yes. 
+ exists.side_effect = [False, True] + + isosr = self.create_isosr() + with self.assertRaises(SR.SROSError) as ose: + isosr.attach(None) + self.assertEquals(ose.exception.errno, 226) + self.assertFalse(pread.called) + class TestISOSR_overNFS(unittest.TestCase): From 9bccc8731dda635b6fc55d2131e76ea598b10f13 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 18 May 2022 17:28:09 +0200 Subject: [PATCH 012/133] MooseFS SR uses now UUID subdirs for each SR A sm-config boolean param `subdir` is available to configure where to store the VHDs: - In a subdir with the SR UUID, the new behavior - In the root directory of the MooseFS SR By default, new SRs are created with `subdir` = True. Existing SRs are not modified and continue to use the folder that was given at SR creation, directly, without looking for a subdirectory. Signed-off-by: Ronan Abhamon --- drivers/MooseFSSR.py | 56 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/drivers/MooseFSSR.py b/drivers/MooseFSSR.py index be5112c8..ab72f4e9 100755 --- a/drivers/MooseFSSR.py +++ b/drivers/MooseFSSR.py @@ -18,6 +18,7 @@ # # MooseFSSR: Based on CEPHFSSR and FileSR, mounts MooseFS share +import distutils.util import errno import os import syslog as _syslog @@ -99,7 +100,8 @@ def load(self, sr_uuid): if 'masterhost' not in self.dconf: raise xs_errors.XenError('ConfigServerMissing') self.remoteserver = self.dconf['masterhost'] - self.remotepath = self.dconf['rootpath'] + self.rootpath = self.dconf['rootpath'] + self.remotepath = self.rootpath # if masterport is not specified, use default: 9421 if 'masterport' not in self.dconf: self.remoteport = "9421" @@ -109,6 +111,14 @@ def load(self, sr_uuid): self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) else: self.sm_config = self.srcmd.params.get('sr_sm_config') or {} + + if self.srcmd.cmd != 'sr_create': + self.subdir = distutils.util.strtobool( + self.sm_config.get('subdir') or '0' + ) + if 
self.subdir: + self.remotepath = os.path.join(self.remotepath, sr_uuid) + self.attached = False self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) self.mountpoint = self.path @@ -138,7 +148,10 @@ def mount(self, mountpoint=None): options.append(self.dconf['options']) if options: options = ['-o', ','.join(options)] - command = ["mount", '-t', 'moosefs', self.remoteserver+":"+self.remoteport+":"+self.remotepath, mountpoint] + options + remote = '{}:{}:{}'.format( + self.remoteserver, self.remoteport, self.remotepath + ) + command = ["mount", '-t', 'moosefs', remote, mountpoint] + options util.ioretry(lambda: util.pread(command), errlist=[errno.EPIPE, errno.EIO], maxretry=2, nofail=True) except util.CommandException, inst: syslog(_syslog.LOG_ERR, 'MooseFS mount failed ' + inst.__str__()) @@ -199,6 +212,7 @@ def create(self, sr_uuid, size): if self.checkmount(): raise SR.SROSError(113, 'MooseFS mount point already attached') + assert self.remotepath == self.rootpath try: self.mount() except MooseFSException, exc: @@ -210,6 +224,33 @@ def create(self, sr_uuid, size): pass raise SR.SROSError(111, "MooseFS mount error [opterr=%s]" % exc.errstr) + try: + self.subdir = self.sm_config.get('subdir') + if self.subdir is None: + self.subdir = True + else: + self.subdir = distutils.util.strtobool(self.subdir) + + self.sm_config['subdir'] = str(self.subdir) + self.session.xenapi.SR.set_sm_config(self.sr_ref, self.sm_config) + + if not self.subdir: + return + + subdir = os.path.join(self.mountpoint, sr_uuid) + if util.ioretry(lambda: util.pathexists(subdir)): + if util.ioretry(lambda: util.isdir(subdir)): + raise xs_errors.XenError('SRExists') + else: + try: + util.ioretry(lambda: util.makedirs(subdir)) + except util.CommandException as e: + if e.code != errno.EEXIST: + raise MooseFSException( + 'Failed to create SR subdir: {}'.format(e) + ) + finally: + self.detach(sr_uuid) def delete(self, sr_uuid): # try to remove/delete non VDI contents first @@ -217,8 +258,15 @@ def 
delete(self, sr_uuid): try: if self.checkmount(): self.detach(sr_uuid) - if util.ioretry(lambda: util.pathexists(self.mountpoint)): - util.ioretry(lambda: os.rmdir(self.mountpoint)) + + if self.subdir: + # Mount using rootpath () instead of /. + self.remotepath = self.rootpath + self.attach(sr_uuid) + subdir = os.path.join(self.mountpoint, sr_uuid) + if util.ioretry(lambda: util.pathexists(subdir)): + util.ioretry(lambda: os.rmdir(subdir)) + self.detach(sr_uuid) except util.CommandException, inst: self.detach(sr_uuid) if inst.code != errno.ENOENT: From b018cd292425214562687ce11e1cdaec05e0c838 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 23 Jun 2022 10:36:36 +0200 Subject: [PATCH 013/133] Fix is_open call for many drivers (#25) Ensure all shared drivers are imported in `_is_open` definition to register them in the driver list. Otherwise this function always fails with a SRUnknownType exception. Also, we must add two fake mandatory parameters to make MooseFS happy: `masterhost` and `rootpath`. Same for CephFS with: `serverpath`. (NFS driver is directly patched to ensure there is no usage of the `serverpath` param because its value is equal to None.) `location` param is required to use ZFS, to be more precise, in the parent class: `FileSR`. Signed-off-by: Ronan Abhamon --- drivers/GlusterFSSR.py | 3 ++- drivers/NFSSR.py | 12 ++++++++---- drivers/on_slave.py | 22 ++++++++++++++++++++-- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/GlusterFSSR.py b/drivers/GlusterFSSR.py index a2f7484f..323718d9 100644 --- a/drivers/GlusterFSSR.py +++ b/drivers/GlusterFSSR.py @@ -96,7 +96,8 @@ def load(self, sr_uuid): self.driver_config = DRIVER_CONFIG if 'server' not in self.dconf: raise xs_errors.XenError('ConfigServerMissing') - self.remoteserver = self.dconf['server'] + # Can be None => on-slave plugin hack (is_open function). 
+ self.remoteserver = self.dconf['server'] or '' if self.sr_ref and self.session is not None: self.sm_config = self.session.xenapi.SR.get_sm_config(self.sr_ref) else: diff --git a/drivers/NFSSR.py b/drivers/NFSSR.py index 1fd32b43..e3ab3252 100755 --- a/drivers/NFSSR.py +++ b/drivers/NFSSR.py @@ -83,9 +83,12 @@ def load(self, sr_uuid): self.sm_config = self.srcmd.params.get('sr_sm_config') or {} self.other_config = self.srcmd.params.get('sr_other_config') or {} self.nosubdir = self.sm_config.get('nosubdir') == "true" - if self.dconf.has_key('serverpath'): - self.remotepath = os.path.join(self.dconf['serverpath'], - not self.nosubdir and sr_uuid or "").encode('utf-8') + serverpath = self.dconf.get('serverpath') + if serverpath is not None: + self.remotepath = os.path.join( + serverpath, + not self.nosubdir and sr_uuid or "" + ).encode('utf-8') self.path = os.path.join(SR.MOUNT_BASE, sr_uuid) # Handle optional dconf attributes @@ -100,7 +103,8 @@ def load(self, sr_uuid): def validate_remotepath(self, scan): - if not self.dconf.has_key('serverpath'): + serverpath = self.dconf.get('serverpath') + if serverpath is None: if scan: try: self.scan_exports(self.dconf['server']) diff --git a/drivers/on_slave.py b/drivers/on_slave.py index 0d60d969..534e6c90 100755 --- a/drivers/on_slave.py +++ b/drivers/on_slave.py @@ -72,7 +72,17 @@ def multi(session, args): def _is_open(session, args): """Check if VDI is open by a tapdisk on this host""" - import SRCommand, SR, NFSSR, EXTSR, LVHDSR, blktap2 + import SRCommand + import SR + import CephFSSR + import EXTSR + import GlusterFSSR + import LVHDSR + import MooseFSSR + import NFSSR + import XFSSR + import ZFSSR + import blktap2 util.SMlog("on-slave.is_open: %s" % args) vdiUuid = args["vdiUuid"] @@ -86,7 +96,15 @@ def _is_open(session, args): srType = "lvhd" cmd = SRCommand.SRCommand(None) cmd.driver_info = {"capabilities": None} - cmd.dconf = {"server": None, "device": "/HACK"} + cmd.dconf = { + "server": None, + "device": "/HACK", 
+ # Hack for custom XCP-ng drivers. + "masterhost": None, # MooseFS + "rootpath": None, # MooseFS + "serverpath": None, # CephFS + "location": "/HACK" # ZFS + } cmd.params = {"command": None} driver = SR.driver(srType) From bdb7ced2830833966a033bfd44862068581a0eee Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 23 Jun 2022 10:37:07 +0200 Subject: [PATCH 014/133] Remove SR_CACHING capability for many SR types (#24) SR_CACHING offers the capacity to use IntelliCache, but this feature is only available using NFS SR. For more details, the implementation of `_setup_cache` in blktap2.py uses only an instance of NFSFileVDI for the shared target. Signed-off-by: Ronan Abhamon --- drivers/CephFSSR.py | 2 +- drivers/GlusterFSSR.py | 2 +- drivers/MooseFSSR.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/CephFSSR.py b/drivers/CephFSSR.py index d7974907..09c928be 100644 --- a/drivers/CephFSSR.py +++ b/drivers/CephFSSR.py @@ -38,7 +38,7 @@ import xs_errors from lock import Lock -CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "SR_CACHING", +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", "VDI_GENERATE_CONFIG", diff --git a/drivers/GlusterFSSR.py b/drivers/GlusterFSSR.py index 323718d9..61a7d409 100644 --- a/drivers/GlusterFSSR.py +++ b/drivers/GlusterFSSR.py @@ -35,7 +35,7 @@ import xs_errors from lock import Lock -CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "SR_CACHING", +CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", "VDI_GENERATE_CONFIG", diff --git a/drivers/MooseFSSR.py b/drivers/MooseFSSR.py index ab72f4e9..212f1ad2 100755 --- a/drivers/MooseFSSR.py +++ b/drivers/MooseFSSR.py @@ -39,7 +39,7 @@ import xs_errors from lock import Lock -CAPABILITIES = ["SR_PROBE", "SR_UPDATE", "SR_CACHING", +CAPABILITIES = 
["SR_PROBE", "SR_UPDATE", "VDI_CREATE", "VDI_DELETE", "VDI_ATTACH", "VDI_DETACH", "VDI_UPDATE", "VDI_CLONE", "VDI_SNAPSHOT", "VDI_RESIZE", "VDI_MIRROR", "VDI_GENERATE_CONFIG", From 4de671da3b28a773c312ae5cfb476eb21917209b Mon Sep 17 00:00:00 2001 From: BenjiReis Date: Fri, 4 Aug 2023 12:10:08 +0200 Subject: [PATCH 015/133] Remove `SR_PROBE` from ZFS capabilities (#37) The probe method is not implemented so we shouldn't advertise it. Signed-off-by: BenjiReis --- drivers/ZFSSR.py | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ZFSSR.py b/drivers/ZFSSR.py index d3752101..b8032117 100644 --- a/drivers/ZFSSR.py +++ b/drivers/ZFSSR.py @@ -23,7 +23,6 @@ import xs_errors CAPABILITIES = [ - 'SR_PROBE', 'SR_UPDATE', 'VDI_CREATE', 'VDI_DELETE', From 0aec61ed673c4d412c3638ce0f111749c2d452ae Mon Sep 17 00:00:00 2001 From: Guillaume Date: Wed, 16 Aug 2023 13:42:21 +0200 Subject: [PATCH 016/133] Fix vdi-ref when static vdis are used When static vdis are used there is no snapshots and we don't want to call method from XAPI. 
Signed-off-by: Guillaume --- drivers/LVHDSR.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/LVHDSR.py b/drivers/LVHDSR.py index dd8e20b9..6ac3f804 100755 --- a/drivers/LVHDSR.py +++ b/drivers/LVHDSR.py @@ -1532,10 +1532,11 @@ def detach(self, sr_uuid, vdi_uuid): elif self.sr.provision == "thick": needDeflate = False # except for snapshots, which are always deflated - vdi_ref = self.sr.srcmd.params['vdi_ref'] - snap = self.session.xenapi.VDI.get_is_a_snapshot(vdi_ref) - if snap: - needDeflate = True + if self.sr.srcmd.cmd != 'vdi_detach_from_config': + vdi_ref = self.sr.srcmd.params['vdi_ref'] + snap = self.session.xenapi.VDI.get_is_a_snapshot(vdi_ref) + if snap: + needDeflate = True if needDeflate: try: From 9e7f3ac4bcd60a96c29056539976ce496418f898 Mon Sep 17 00:00:00 2001 From: Samuel Verschelde Date: Fri, 27 Jan 2023 12:03:15 +0100 Subject: [PATCH 017/133] Tell users not to edit multipath.conf directly This file is meant to remain unchanged and regularly updated along with the SM component. Users can create a custom configuration file in /etc/multipath/conf.d/ instead. Signed-off-by: Samuel Verschelde (cherry picked from commit b44d3f5db6d22d5a649fffebebb284afd6da39f7) --- multipath/multipath.conf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/multipath/multipath.conf b/multipath/multipath.conf index 1073faa0..3de11441 100644 --- a/multipath/multipath.conf +++ b/multipath/multipath.conf @@ -1,3 +1,11 @@ +# --- WARNING: DO NOT EDIT THIS FILE --- +# The contents of this file may be overwritten at any future time through a +# system update, causing any custom configuration to be lost. +# +# For custom multipath configuration, create a separate .conf file in the +# /etc/multipath/conf.d/ directory. +# --- END OF WARNING --- + # This configuration file is used to overwrite the built-in configuration of # multipathd. 
# For information on the syntax refer to `man multipath.conf` and the examples From 20616a73ecb5f6a4c2609768273b33280d294ee6 Mon Sep 17 00:00:00 2001 From: Samuel Verschelde Date: Fri, 27 Jan 2023 12:23:13 +0100 Subject: [PATCH 018/133] Add custom.conf multipath configuration file Meant to be installed as /etc/multipath/conf.d/custom.conf for users to have an easy entry point for editing, as well as information on what will happen to this file through future system updates and upgrades. Signed-off-by: Samuel Verschelde (cherry picked from commit 18b79a5b1f49c02d8ee8ba2408d0395d1cfb3ee9) --- multipath/custom.conf | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 multipath/custom.conf diff --git a/multipath/custom.conf b/multipath/custom.conf new file mode 100644 index 00000000..3c8583f1 --- /dev/null +++ b/multipath/custom.conf @@ -0,0 +1,6 @@ +# Custom configuration for multipathd + +# Changes made to this file will not be overwritten by future system updates. +# They will also be retained through system upgrades to newer releases. + +# Refer to "man multipath.conf" From 398471d96cfea0424c2adae1630f085d8bba0986 Mon Sep 17 00:00:00 2001 From: Samuel Verschelde Date: Fri, 25 Aug 2023 17:47:34 +0200 Subject: [PATCH 019/133] Install /etc/multipath/conf.d/custom.conf Update Makefile so that the file is installed along with sm. 
Signed-off-by: Samuel Verschelde --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index b0ae3536..3357cbfb 100755 --- a/Makefile +++ b/Makefile @@ -78,6 +78,7 @@ SM_LIBS += sr_health_check UDEV_RULES = 65-multipath 55-xs-mpath-scsidev 57-usb 58-xapi MPATH_DAEMON = sm-multipath MPATH_CONF = multipath.conf +MPATH_CUSTOM_CONF = custom.conf SMLOG_CONF = SMlog SM_XML := XE_SR_ERRORCODES @@ -93,6 +94,7 @@ UDEV_SCRIPTS_DIR := /etc/udev/scripts/ SYSTEMD_SERVICE_DIR := /usr/lib/systemd/system/ INIT_DIR := /etc/rc.d/init.d/ MPATH_CONF_DIR := /etc/multipath.xenserver/ +MPATH_CUSTOM_CONF_DIR := /etc/multipath/conf.d/ MODPROBE_DIR := /etc/modprobe.d/ EXTENSION_SCRIPT_DEST := /etc/xapi.d/extensions/ LOGROTATE_DIR := /etc/logrotate.d/ @@ -146,6 +148,7 @@ install: precheck mkdir -p $(SM_STAGING)$(INIT_DIR) mkdir -p $(SM_STAGING)$(SYSTEMD_SERVICE_DIR) mkdir -p $(SM_STAGING)$(MPATH_CONF_DIR) + mkdir -p $(SM_STAGING)$(MPATH_CUSTOM_CONF_DIR) mkdir -p $(SM_STAGING)$(MODPROBE_DIR) mkdir -p $(SM_STAGING)$(LOGROTATE_DIR) mkdir -p $(SM_STAGING)$(DEBUG_DEST) @@ -163,6 +166,8 @@ install: precheck $(SM_STAGING)$(SM_DEST)/plugins/ install -m 644 multipath/$(MPATH_CONF) \ $(SM_STAGING)/$(MPATH_CONF_DIR) + install -m 644 multipath/$(MPATH_CUSTOM_CONF) \ + $(SM_STAGING)/$(MPATH_CUSTOM_CONF_DIR) install -m 755 multipath/sm-multipath \ $(SM_STAGING)/$(INIT_DIR) install -m 755 multipath/multipath-root-setup \ From 71e0c5261213427b61a80c932f4184281e42ce61 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 16 Feb 2022 18:24:56 +0100 Subject: [PATCH 020/133] Fix timeout_call: alarm must be reset in case of success Otherwise the SIGALRM signal can be emitted after the execution of the given user function. 
Signed-off-by: Ronan Abhamon --- drivers/util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/util.py b/drivers/util.py index f6739373..c479de98 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -850,9 +850,8 @@ def handler(signum, frame): signal.alarm(timeoutseconds) try: function(*arguments) - except: + finally: signal.alarm(0) - raise def _incr_iscsiSR_refcount(targetIQN, uuid): From f2d089bcbf24f5144fe5a8d2e4c2ae64878790d5 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 16 Feb 2022 18:28:06 +0100 Subject: [PATCH 021/133] timeout_call returns the result of user function now Signed-off-by: Ronan Abhamon --- drivers/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/util.py b/drivers/util.py index c479de98..f4f62525 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -849,7 +849,7 @@ def handler(signum, frame): signal.signal(signal.SIGALRM, handler) signal.alarm(timeoutseconds) try: - function(*arguments) + return function(*arguments) finally: signal.alarm(0) From 54923510e18471fcbdb64caa24d952a3fb3f6b59 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 20 Nov 2020 16:42:52 +0100 Subject: [PATCH 022/133] fix(LinstorSR): repair volumes only if an exclusive command is executed Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 8be18367..a5bf5abd 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -388,10 +388,16 @@ def wrap(self, *args, **kwargs): try: # Try to open SR if exists. + # We can repair only if we are on the master AND if + # we are trying to execute an exclusive operation. + # Otherwise we could try to delete a VDI being created or + # during a snapshot. An exclusive op is the guarantee that the + # SR is locked. 
+ self._linstor = LinstorVolumeManager( self._master_uri, self._group_name, - repair=self._is_master, + repair=self._is_master and + self.srcmd.cmd in self.ops_exclusive, logger=util.SMlog ) self._vhdutil = LinstorVhdUtil(self.session, self._linstor) From d9537aa84653c9b8f4691da10be3f929776ea687 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 10 Dec 2020 17:56:15 +0100 Subject: [PATCH 023/133] feat(LinstorSR): Improve LINSTOR performance Details: - vdi_attach and vdi_detach are now exclusive - lock volumes on slaves (when vdi_xxx command is used) and avoid release if a timeout is reached - load all VDIs only when necessary, i.e. only if at least one journal entry exists or if sr_scan/sr_attach is executed - use a __slots__ attr in LinstorVolumeManager to increase performance - use a cache directly in LinstorVolumeManager to reduce network request count with LINSTOR - try to always use the same LINSTOR KV object to limit network usage - use a cache to avoid a new JSON parsing when all VDIs are loaded in LinstorSR - limit request count when LINSTOR storage pool info is fetched using a fetch interval - avoid race condition in cleanup: check if a volume is locked in a slave or not before modifying it - ...
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 150 ++++++++--- drivers/cleanup.py | 52 ++-- drivers/linstor-manager | 9 +- drivers/linstorvhdutil.py | 58 ++++- drivers/linstorvolumemanager.py | 432 +++++++++++++++++++------------- 5 files changed, 463 insertions(+), 238 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index a5bf5abd..548f4b17 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -92,7 +92,8 @@ OPS_EXCLUSIVE = [ 'sr_create', 'sr_delete', 'sr_attach', 'sr_detach', 'sr_scan', - 'sr_update', 'vdi_create', 'vdi_delete', 'vdi_clone', 'vdi_snapshot' + 'sr_update', 'sr_probe', 'vdi_init', 'vdi_create', 'vdi_delete', + 'vdi_attach', 'vdi_detach', 'vdi_clone', 'vdi_snapshot', ] # ============================================================================== @@ -185,7 +186,9 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): volume_info = linstor.get_volume_info(vdi_uuid) old_volume_size = volume_info.virtual_size - deflate(vdi_uuid, device_path, new_volume_size, old_volume_size) + deflate( + linstor, vdi_uuid, device_path, new_volume_size, old_volume_size + ) finally: lock.release() @@ -215,11 +218,11 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): opterr='Failed to zero out VHD footer {}'.format(vdi_path) ) - vhdutil.setSizePhys(vdi_path, new_size, False) + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, new_size, False) journaler.remove(LinstorJournaler.INFLATE, vdi_uuid) -def deflate(vdi_uuid, vdi_path, new_size, old_size): +def deflate(linstor, vdi_uuid, vdi_path, new_size, old_size): new_size = LinstorVolumeManager.round_up_volume_size(new_size) if new_size >= old_size: return @@ -229,7 +232,7 @@ def deflate(vdi_uuid, vdi_path, new_size, old_size): .format(vdi_uuid, new_size, old_size) ) - vhdutil.setSizePhys(vdi_path, new_size) + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, new_size) # TODO: Change the LINSTOR volume size using linstor.resize_volume. 
@@ -318,10 +321,13 @@ def load(self, sr_uuid): self._group_name = self.dconf['group-name'] self._master_uri = None - self._vdi_shared_locked = False + self._vdi_shared_time = 0 self._initialized = False + self._all_volume_info_cache = None + self._all_volume_metadata_cache = None + def _locked_load(method): @functools.wraps(method) def wrap(self, *args, **kwargs): @@ -374,7 +380,7 @@ def wrap(self, *args, **kwargs): # behaviors if the GC is executed during an action on a slave. if self.cmd.startswith('vdi_'): self._shared_lock_vdi(self.srcmd.params['vdi_uuid']) - self._vdi_shared_locked = True + self._vdi_shared_time = time.time() self._journaler = LinstorJournaler( self._master_uri, self._group_name, logger=util.SMlog @@ -396,8 +402,10 @@ def wrap(self, *args, **kwargs): self._linstor = LinstorVolumeManager( self._master_uri, self._group_name, - repair=self._is_master and - self.srcmd.cmd in self.ops_exclusive, + repair=( + self._is_master and + self.srcmd.cmd in self.ops_exclusive + ), logger=util.SMlog ) self._vhdutil = LinstorVhdUtil(self.session, self._linstor) @@ -422,22 +430,55 @@ def wrap(self, *args, **kwargs): if hosts: util.SMlog('Failed to join node(s): {}'.format(hosts)) + # Ensure we use a non-locked volume when vhdutil is called. + if ( + self._is_master and self.cmd.startswith('vdi_') and + self.cmd != 'vdi_create' + ): + self._linstor.ensure_volume_is_not_locked( + self.srcmd.params['vdi_uuid'] + ) + try: - # If the command is a SR command on the master, we must - # load all VDIs and clean journal transactions. - # We must load the VDIs in the snapshot case too. + # If the command is a SR scan command on the master, + # we must load all VDIs and clean journal transactions. + # We must load the VDIs in the snapshot case too only if + # there is at least one entry in the journal. + # + # If the command is a SR command we want at least to remove + # resourceless volumes. 
if self._is_master and self.cmd not in [ 'vdi_attach', 'vdi_detach', 'vdi_activate', 'vdi_deactivate', 'vdi_epoch_begin', 'vdi_epoch_end', 'vdi_update', 'vdi_destroy' ]: - self._load_vdis() - self._undo_all_journal_transactions() + load_vdis = ( + self.cmd == 'sr_scan' or + self.cmd == 'sr_attach' + ) or len( + self._journaler.get_all(LinstorJournaler.INFLATE) + ) or len( + self._journaler.get_all(LinstorJournaler.CLONE) + ) + + if load_vdis: + # We use a cache to avoid repeated JSON parsing. + # The performance gain is not big but we can still + # enjoy it with a few lines. + self._create_linstor_cache() + self._load_vdis() + self._destroy_linstor_cache() + + self._undo_all_journal_transactions() self._linstor.remove_resourceless_volumes() self._synchronize_metadata() except Exception as e: + if self.cmd == 'sr_scan': + # Always raise, we don't want to remove VDIs + # from the XAPI database otherwise. + raise e util.SMlog( 'Ignoring exception in LinstorSR.load: {}'.format(e) ) @@ -449,7 +490,7 @@ def wrap(self, *args, **kwargs): @_locked_load def cleanup(self): - if self._vdi_shared_locked: + if self._vdi_shared_time: self._shared_lock_vdi(self.srcmd.params['vdi_uuid'], locked=False) @_locked_load @@ -605,6 +646,23 @@ def _shared_lock_vdi(self, vdi_uuid, locked=True): 'locked': str(locked) } + # Note: We must avoid to unlock the volume if the timeout is reached + # because during volume unlock, the SR lock is not used. Otherwise + # we could destroy a valid lock acquired from another host... + # + # This code is not very clean, the ideal solution would be to acquire + # the SR lock during volume unlock (like lock) but it's not easy + # to implement without impacting performance. 
+ if not locked: + elapsed_time = time.time() - self._vdi_shared_time + timeout = LinstorVolumeManager.LOCKED_EXPIRATION_DELAY * 0.7 + if elapsed_time >= timeout: + util.SMlog( + 'Avoid unlock call of {} because timeout has been reached' + .format(vdi_uuid) + ) + return + ret = self.session.xenapi.host.call_plugin( master, self.MANAGER_PLUGIN, method, args ) @@ -659,7 +717,7 @@ def _synchronize_metadata_and_xapi(self): # Now update the VDI information in the metadata if required. xenapi = self.session.xenapi - volumes_metadata = self._linstor.volumes_with_metadata + volumes_metadata = self._linstor.get_volumes_with_metadata() for vdi_uuid, volume_metadata in volumes_metadata.items(): try: vdi_ref = xenapi.VDI.get_by_uuid(vdi_uuid) @@ -751,8 +809,8 @@ def _load_vdis(self): xapi_vdi_uuids.add(xenapi.VDI.get_uuid(vdi)) # 2. Get volumes info. - all_volume_info = self._linstor.volumes_with_info - volumes_metadata = self._linstor.volumes_with_metadata + all_volume_info = self._all_volume_info_cache + volumes_metadata = self._all_volume_metadata_cache # 3. Get CBT vdis. # See: https://support.citrix.com/article/CTX230619 @@ -1020,13 +1078,13 @@ def _handle_interrupted_inflate(self, vdi_uuid, old_size): util.SMlog('Cannot deflate missing VDI {}'.format(vdi_uuid)) return - current_size = self._linstor.get_volume_info(self.uuid).virtual_size + current_size = self._all_volume_info_cache.get(self.uuid).virtual_size util.zeroOut( vdi.path, current_size - vhdutil.VHD_FOOTER_SIZE, vhdutil.VHD_FOOTER_SIZE ) - deflate(vdi_uuid, vdi.path, old_size, current_size) + deflate(self._linstor, vdi_uuid, vdi.path, old_size, current_size) def _handle_interrupted_clone( self, vdi_uuid, clone_info, force_undo=False @@ -1039,7 +1097,7 @@ def _handle_interrupted_clone( base_uuid, snap_uuid = clone_info.split('_') # Use LINSTOR data because new VDIs may not be in the XAPI. 
- volume_names = self._linstor.volumes_with_name + volume_names = self._linstor.get_volumes_with_name() # Check if we don't have a base VDI. (If clone failed at startup.) if base_uuid not in volume_names: @@ -1095,7 +1153,7 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): if base_type == vhdutil.VDI_TYPE_VHD: vhd_info = self._vhdutil.get_vhd_info(base_uuid, False) if vhd_info.hidden: - vhdutil.setHidden(base_path, False) + self._vhdutil.set_hidden(base_path, False) elif base_type == vhdutil.VDI_TYPE_RAW and \ base_metadata.get(HIDDEN_TAG): self._linstor.update_volume_metadata( @@ -1156,6 +1214,19 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): util.SMlog('*** INTERRUPTED CLONE OP: rollback success') + # -------------------------------------------------------------------------- + # Cache. + # -------------------------------------------------------------------------- + + def _create_linstor_cache(self): + self._all_volume_metadata_cache = \ + self._linstor.get_volumes_with_metadata() + self._all_volume_info_cache = self._linstor.get_volumes_with_info() + + def _destroy_linstor_cache(self): + self._all_volume_info_cache = None + self._all_volume_metadata_cache = None + # -------------------------------------------------------------------------- # Misc. # -------------------------------------------------------------------------- @@ -1326,16 +1397,16 @@ def create(self, sr_uuid, vdi_uuid, size): if self.vdi_type == vhdutil.VDI_TYPE_RAW: self.size = volume_info.virtual_size else: - vhdutil.create( + self.sr._vhdutil.create( self.path, size, False, self.MAX_METADATA_VIRT_SIZE ) self.size = self.sr._vhdutil.get_size_virt(self.uuid) if self._key_hash: - vhdutil.setKey(self.path, self._key_hash) + self.sr._vhdutil.set_key(self.path, self._key_hash) # Because vhdutil commands modify the volume data, - # we must retrieve a new time the utilisation size. + # we must retrieve a new time the utilization size. 
volume_info = self._linstor.get_volume_info(self.uuid) volume_metadata = { @@ -1548,7 +1619,7 @@ def resize(self, sr_uuid, vdi_uuid, size): self.sr._journaler, self._linstor, self.uuid, self.path, new_volume_size, old_volume_size ) - vhdutil.setSizeVirtFast(self.path, size) + self.sr._vhdutil.set_size_virt_fast(self.path, size) # Reload size attributes. self._load_this() @@ -1580,8 +1651,8 @@ def compose(self, sr_uuid, vdi1, vdi2): if not blktap2.VDI.tap_pause(self.session, self.sr.uuid, self.uuid): raise util.SMException('Failed to pause VDI {}'.format(self.uuid)) try: - vhdutil.setParent(self.path, parent_path, False) - vhdutil.setHidden(parent_path) + self.sr._vhdutil.set_parent(self.path, parent_path, False) + self.sr._vhdutil.set_hidden(parent_path) self.sr.session.xenapi.VDI.set_managed( self.sr.srcmd.params['args'][0], False ) @@ -1658,11 +1729,20 @@ def reset_leaf(self, sr_uuid, vdi_uuid): .format(self.uuid) ) - vhdutil.killData(self.path) + self.sr._vhdutil.kill_data(self.path) def _load_this(self): - volume_metadata = self._linstor.get_volume_metadata(self.uuid) - volume_info = self._linstor.get_volume_info(self.uuid) + volume_metadata = None + if self.sr._all_volume_metadata_cache: + volume_metadata = self.sr._all_volume_metadata_cache.get(self.uuid) + if volume_metadata is None: + volume_metadata = self._linstor.get_volume_metadata(self.uuid) + + volume_info = None + if self.sr._all_volume_info_cache: + volume_info = self.sr._all_volume_info_cache.get(self.uuid) + if volume_info is None: + volume_info = self._linstor.get_volume_info(self.uuid) # Contains the physical size used on all disks. 
# When LINSTOR LVM driver is used, the size should be similar to @@ -1697,7 +1777,7 @@ def _mark_hidden(self, hidden=True): return if self.vdi_type == vhdutil.VDI_TYPE_VHD: - vhdutil.setHidden(self.path, hidden) + self.sr._vhdutil.set_hidden(self.path, hidden) else: self._linstor.update_volume_metadata(self.uuid, { HIDDEN_TAG: hidden @@ -1813,9 +1893,7 @@ def _determine_type_and_path(self): 'VDIUnavailable', opterr='failed to get vdi_type in metadata' ) - self._update_device_name( - self._linstor.get_volume_name(self.uuid) - ) + self._update_device_name(self._linstor.get_volume_name(self.uuid)) def _update_device_name(self, device_name): self._device_name = device_name @@ -1838,7 +1916,7 @@ def _create_snapshot(self, snap_uuid, snap_of_uuid=None): # 2. Write the snapshot content. is_raw = (self.vdi_type == vhdutil.VDI_TYPE_RAW) - vhdutil.snapshot( + self.sr._vhdutil.snapshot( snap_path, self.path, is_raw, self.MAX_METADATA_VIRT_SIZE ) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index b027d882..74c4de07 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -449,7 +449,7 @@ def srUpdate(self): # # VDI # -class VDI: +class VDI(object): """Object representing a VDI of a VHD-based SR""" POLL_INTERVAL = 1 @@ -1425,17 +1425,15 @@ def delete(self): self.sr.unlock() VDI.delete(self) - def pauseVDIs(self, vdiList): - self.sr._linstor.ensure_volume_list_is_not_locked( - vdiList, timeout=self.VOLUME_LOCK_TIMEOUT - ) - return super(VDI).pauseVDIs(vdiList) + def validate(self, fast=False): + if not self.sr._vhdutil.check(self.uuid, fast=fast): + raise util.SMException('VHD {} corrupted'.format(self)) - def _liveLeafCoalesce(self, vdi): + def pause(self, failfast=False): self.sr._linstor.ensure_volume_is_not_locked( - vdi.uuid, timeout=self.VOLUME_LOCK_TIMEOUT + self.uuid, timeout=self.VOLUME_LOCK_TIMEOUT ) - return super(VDI)._liveLeafCoalesce(vdi) + return super(LinstorVDI, self).pause(failfast) def _relinkSkip(self): abortFlag = IPCFlag(self.sr.uuid) @@ -1479,7 
+1477,7 @@ def _queryVHDBlocks(self): # # SR # -class SR: +class SR(object): class LogFilter: def __init__(self, sr): self.sr = sr @@ -2897,6 +2895,12 @@ def scan(self, force=False): self.logFilter.logState() self._handleInterruptedCoalesceLeaf() + def pauseVDIs(self, vdiList): + self._linstor.ensure_volume_list_is_not_locked( + vdiList, timeout=LinstorVDI.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorSR, self).pauseVDIs(vdiList) + def _reloadLinstor(self): session = self.xapi.session host_ref = util.get_this_host_ref(session) @@ -2947,8 +2951,8 @@ def _load_vdi_info(self): # TODO: Ensure metadata contains the right info. - all_volume_info = self._linstor.volumes_with_info - volumes_metadata = self._linstor.volumes_with_metadata + all_volume_info = self._linstor.get_volumes_with_info() + volumes_metadata = self._linstor.get_volumes_with_metadata() for vdi_uuid, volume_info in all_volume_info.items(): try: if not volume_info.name and \ @@ -2979,8 +2983,10 @@ def _calcExtraSpaceNeeded(self, child, parent): virtual_size = LinstorVolumeManager.round_up_volume_size( parent.sizeVirt + meta_overhead + bitmap_overhead ) - # TODO: Check result. 
- return virtual_size - self._linstor.get_volume_size(parent.uuid) + volume_size = self._linstor.get_volume_size(parent.uuid) + + assert virtual_size >= volume_size + return virtual_size - volume_size def _hasValidDevicePath(self, uuid): try: @@ -2990,6 +2996,16 @@ def _hasValidDevicePath(self, uuid): return False return True + def _liveLeafCoalesce(self, vdi): + self.lock() + try: + self._linstor.ensure_volume_is_not_locked( + vdi.uuid, timeout=LinstorVDI.VOLUME_LOCK_TIMEOUT + ) + return super(LinstorSR, self)._liveLeafCoalesce(vdi) + finally: + self.unlock() + def _handleInterruptedCoalesceLeaf(self): entries = self.journaler.get_all(VDI.JRN_LEAF) for uuid, parentUuid in entries.iteritems(): @@ -3016,7 +3032,6 @@ def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): 'Renaming parent back: {} -> {}'.format(childUuid, parentUuid) ) parent.rename(parentUuid) - util.fistpoint.activate('LVHDRT_coaleaf_undo_after_rename', self.uuid) child = self.getVDI(childUuid) if not child: @@ -3032,9 +3047,6 @@ def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): Util.log('Updating the VDI record') child.setConfig(VDI.DB_VHD_PARENT, parentUuid) child.setConfig(VDI.DB_VDI_TYPE, vhdutil.VDI_TYPE_VHD) - util.fistpoint.activate( - 'LVHDRT_coaleaf_undo_after_rename2', self.uuid - ) # TODO: Maybe deflate here. 
@@ -3043,10 +3055,7 @@ def _undoInterruptedCoalesceLeaf(self, childUuid, parentUuid): if not parent.hidden: parent._setHidden(True) self._updateSlavesOnUndoLeafCoalesce(parent, child) - util.fistpoint.activate('LVHDRT_coaleaf_undo_end', self.uuid) Util.log('*** leaf-coalesce undo successful') - if util.fistpoint.is_active('LVHDRT_coaleaf_stop_after_recovery'): - child.setConfig(VDI.DB_LEAFCLSC, VDI.LEAFCLSC_DISABLED) def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): Util.log('*** FINISH LEAF-COALESCE') @@ -3059,7 +3068,6 @@ def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): except XenAPI.Failure: pass self._updateSlavesOnResize(vdi) - util.fistpoint.activate('LVHDRT_coaleaf_finish_end', self.uuid) Util.log('*** finished leaf-coalesce successfully') def _checkSlaves(self, vdi): diff --git a/drivers/linstor-manager b/drivers/linstor-manager index f7ce1809..e7e58fd8 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -118,7 +118,9 @@ def detach(session, args): def check(session, args): try: device_path = args['devicePath'] - return str(vhdutil.check(device_path)) + ignore_missing_footer = args['ignoreMissingFooter'] + fast = args['fast'] + return str(vhdutil.check(device_path, ignore_missing_footer, fast)) except Exception as e: util.SMlog('linstor-manager:check error: {}'.format(e)) raise @@ -236,7 +238,10 @@ def lock_vdi(session, args): group_name = args['groupName'] locked = distutils.util.strtobool(args['locked']) + # We must lock to mark the VDI. 
lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) + if locked: + lock.acquire() linstor = LinstorVolumeManager( get_linstor_uri(session), @@ -249,7 +254,7 @@ def lock_vdi(session, args): except Exception as e: util.SMlog('linstor-manager:lock_vdi error: {}'.format(e)) finally: - if lock: + if locked and lock: lock.release() return str(False) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index f31c7525..ac858371 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -89,13 +89,33 @@ def exec_remote_method(): return decorated +def linstormodifier(): + def decorated(func): + def wrapper(*args, **kwargs): + self = args[0] + + ret = func(*args, **kwargs) + self._linstor.invalidate_resource_cache() + return ret + return wrapper + return decorated + + class LinstorVhdUtil: def __init__(self, session, linstor): self._session = session self._linstor = linstor + # -------------------------------------------------------------------------- + # Getters. + # -------------------------------------------------------------------------- + + def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): + kwargs = {'ignoreMissingFooter': ignore_missing_footer, 'fast': fast} + return self._check(vdi_uuid, **kwargs) + @linstorhostcall(vhdutil.check, 'check') - def check(self, vdi_uuid, **kwargs): + def _check(self, vdi_uuid, **kwargs): return distutils.util.strtobool(kwargs['response']) def get_vhd_info(self, vdi_uuid, include_parent=True): @@ -148,6 +168,42 @@ def get_key_hash(self, vdi_uuid, **kwargs): def get_block_bitmap(self, vdi_uuid, **kwargs): return base64.b64decode(kwargs['response']) + # -------------------------------------------------------------------------- + # Setters. 
+ # -------------------------------------------------------------------------- + + @linstormodifier() + def create(self, path, size, static, msize=0): + return vhdutil.create(path, size, static, msize) + + @linstormodifier() + def set_size_virt_fast(self, path, size): + return vhdutil.setSizeVirtFast(path, size) + + @linstormodifier() + def set_size_phys(self, path, size, debug=True): + return vhdutil.setSizePhys(path, size, debug) + + @linstormodifier() + def set_parent(self, path, parentPath, parentRaw): + return vhdutil.setParent(path, parentPath, parentRaw) + + @linstormodifier() + def set_hidden(self, path, hidden=True): + return vhdutil.setHidden(path, hidden) + + @linstormodifier() + def set_key(self, path, key_hash): + return vhdutil.setKey(path, key_hash) + + @linstormodifier() + def kill_data(self, path): + return vhdutil.killData(path) + + @linstormodifier() + def snapshot(self, path, parent, parentRaw, msize=0, checkEmpty=True): + return vhdutil.snapshot(path, parent, parentRaw, msize, checkEmpty) + # -------------------------------------------------------------------------- # Helpers. # -------------------------------------------------------------------------- diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index d4004217..d617655f 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -63,6 +63,16 @@ class LinstorVolumeManager(object): A volume in this context is a physical part of the storage layer. """ + __slots__ = ( + '_linstor', '_logger', + '_uri', '_base_group_name', + '_redundancy', '_group_name', + '_volumes', '_storage_pools', + '_storage_pools_time', + '_kv_cache', '_resource_cache', '_volume_info_cache', + '_kv_cache_dirty', '_resource_cache_dirty', '_volume_info_cache_dirty' + ) + DEV_ROOT_PATH = '/dev/drbd/by-res/' # Default LVM extent size. 
@@ -106,6 +116,10 @@ class LinstorVolumeManager(object): PREFIX_SR = 'xcp-sr-' PREFIX_VOLUME = 'xcp-volume-' + # Limit request number when storage pool info is asked, we fetch + # the current pool status after N elapsed seconds. + STORAGE_POOLS_FETCH_INTERVAL = 15 + @staticmethod def default_logger(*args): print(args) @@ -164,6 +178,16 @@ def __init__( self._logger = logger self._redundancy = groups[0].select_filter.place_count self._group_name = group_name + self._volumes = set() + self._storage_pools_time = 0 + + # To increate performance and limit request count to LINSTOR services, + # we use caches. + self._kv_cache = self._create_kv_cache() + self._resource_cache = None + self._resource_cache_dirty = True + self._volume_info_cache = None + self._volume_info_cache_dirty = True self._build_volumes(repair=repair) @property @@ -184,66 +208,6 @@ def volumes(self): """ return self._volumes - @property - def volumes_with_name(self): - """ - Give a volume dictionnary that contains names actually owned. - :return: A volume/name dict. - :rtype: dict(str, str) - """ - return self._get_volumes_by_property(self.REG_VOLUME_NAME) - - @property - def volumes_with_info(self): - """ - Give a volume dictionnary that contains VolumeInfos. - :return: A volume/VolumeInfo dict. - :rtype: dict(str, VolumeInfo) - """ - - volumes = {} - - all_volume_info = self._get_volumes_info() - volume_names = self.volumes_with_name - for volume_uuid, volume_name in volume_names.items(): - if volume_name: - volume_info = all_volume_info.get(volume_name) - if volume_info: - volumes[volume_uuid] = volume_info - continue - - # Well I suppose if this volume is not available, - # LINSTOR has been used directly without using this API. - volumes[volume_uuid] = self.VolumeInfo('') - - return volumes - - @property - def volumes_with_metadata(self): - """ - Give a volume dictionnary that contains metadata. - :return: A volume/metadata dict. 
- :rtype: dict(str, dict) - """ - - volumes = {} - - metadata = self._get_volumes_by_property(self.REG_METADATA) - for volume_uuid, volume_metadata in metadata.items(): - if volume_metadata: - volume_metadata = json.loads(volume_metadata) - if isinstance(volume_metadata, dict): - volumes[volume_uuid] = volume_metadata - continue - raise LinstorVolumeManagerError( - 'Expected dictionary in volume metadata: {}' - .format(volume_uuid) - ) - - volumes[volume_uuid] = {} - - return volumes - @property def max_volume_size_allowed(self): """ @@ -292,7 +256,7 @@ def total_allocated_volume_size(self): """ size = 0 - for resource in self._linstor.resource_list_raise().resources: + for resource in self._get_resource_cache().resources: for volume in resource.volumes: # We ignore diskless pools of the form "DfltDisklessStorPool". if volume.storage_pool_name == self._group_name: @@ -346,12 +310,8 @@ def disconnected_hosts(self): :rtype: set(str) """ - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - disconnected_hosts = set() - for pool in pools: + for pool in self._get_storage_pools(): for report in pool.reports: if report.ret_code & linstor.consts.WARN_NOT_CONNECTED == \ linstor.consts.WARN_NOT_CONNECTED: @@ -397,7 +357,7 @@ def create_volume(self, volume_uuid, size, persistent=True): ) return device_path except Exception: - self._force_destroy_volume(volume_uuid, volume_properties) + self._force_destroy_volume(volume_uuid) raise def mark_volume_as_persistent(self, volume_uuid): @@ -426,7 +386,7 @@ def destroy_volume(self, volume_uuid): volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS self._volumes.remove(volume_uuid) - self._destroy_volume(volume_uuid, volume_properties) + self._destroy_volume(volume_uuid) def lock_volume(self, volume_uuid, locked=True): """ @@ -476,12 +436,15 @@ def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): waiting = False + volume_properties = 
self._get_kv_cache() + start = time.time() while True: # Can't delete in for loop, use a copy of the list. remaining = checked.copy() for volume_uuid in checked: - volume_properties = self._get_volume_properties(volume_uuid) + volume_properties.namespace = \ + self._build_volume_namespace(volume_uuid) timestamp = volume_properties.get( self.PROP_IS_READONLY_TIMESTAMP ) @@ -519,6 +482,7 @@ def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): # We must wait to use the volume. After that we can modify it # ONLY if the SR is locked to avoid bad reads on the slaves. time.sleep(1) + volume_properties = self._create_kv_cache() if waiting: self._logger('No volume locked now!') @@ -542,6 +506,9 @@ def resize_volume(self, volume_uuid, new_size): volume_nr=0, size=new_size / 1024 ) + + self._mark_resource_cache_as_dirty() + error_str = self._get_error_str(result) if error_str: raise LinstorVolumeManagerError( @@ -596,7 +563,7 @@ def get_volume_info(self, volume_uuid): """ volume_name = self.get_volume_name(volume_uuid) - return self._get_volumes_info(filter=[volume_name])[volume_name] + return self._get_volumes_info()[volume_name] def get_device_path(self, volume_uuid): """ @@ -620,7 +587,7 @@ def get_volume_uuid_from_device_path(self, device_path): expected_volume_name = \ self.get_volume_name_from_device_path(device_path) - volume_names = self.volumes_with_name + volume_names = self.get_volumes_with_name() for volume_uuid, volume_name in volume_names.items(): if volume_name == expected_volume_name: return volume_uuid @@ -638,9 +605,11 @@ def get_volume_name_from_device_path(self, device_path): """ node_name = socket.gethostname() - resources = self._linstor.resource_list_raise( - filter_by_nodes=[node_name] - ).resources + + resources = filter( + lambda resource: resource.node_name == node_name, + self._get_resource_cache().resources + ) real_device_path = os.path.realpath(device_path) for resource in resources: @@ -664,6 +633,8 @@ def 
update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): deleted VDI. """ + assert volume_uuid != new_volume_uuid + self._logger( 'Trying to update volume UUID {} to {}...' .format(volume_uuid, new_volume_uuid) @@ -685,36 +656,41 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): .format(volume_uuid) ) - new_volume_properties = self._get_volume_properties( + # 1. Copy in temp variables metadata and volume_name. + metadata = volume_properties.get(self.PROP_METADATA) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + + # 2. Switch to new volume namespace. + volume_properties.namespace = self._build_volume_namespace( new_volume_uuid ) - if list(new_volume_properties.items()): + + if list(volume_properties.items()): raise LinstorVolumeManagerError( 'Cannot update volume uuid {} to {}: ' .format(volume_uuid, new_volume_uuid) + 'this last one is not empty' ) - assert volume_properties.namespace != \ - new_volume_properties.namespace - try: - # 1. Mark new volume properties with PROP_UPDATING_UUID_SRC. + # 3. Mark new volume properties with PROP_UPDATING_UUID_SRC. # If we crash after that, the new properties can be removed # properly. - new_volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS - new_volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid + volume_properties[self.PROP_NOT_EXISTS] = self.STATE_NOT_EXISTS + volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid - # 2. Copy the properties. - for property in [self.PROP_METADATA, self.PROP_VOLUME_NAME]: - new_volume_properties[property] = \ - volume_properties.get(property) + # 4. Copy the properties. + volume_properties[self.PROP_METADATA] = metadata + volume_properties[self.PROP_VOLUME_NAME] = volume_name - # 3. Ok! - new_volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS + # 5. Ok! 
+ volume_properties[self.PROP_NOT_EXISTS] = self.STATE_EXISTS except Exception as e: try: - new_volume_properties.clear() + # Clear the new volume properties in case of failure. + assert volume_properties.namespace == \ + self._build_volume_namespace(new_volume_uuid) + volume_properties.clear() except Exception as e: self._logger( 'Failed to clear new volume properties: {} (ignoring...)' @@ -725,11 +701,21 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): ) try: - # 4. After this point, it's ok we can remove the + # 6. After this point, it's ok we can remove the # PROP_UPDATING_UUID_SRC property and clear the src properties # without problems. + + # 7. Switch to old volume namespace. + volume_properties.namespace = self._build_volume_namespace( + volume_uuid + ) volume_properties.clear() - new_volume_properties.pop(self.PROP_UPDATING_UUID_SRC) + + # 8. Switch a last time to new volume namespace. + volume_properties.namespace = self._build_volume_namespace( + new_volume_uuid + ) + volume_properties.pop(self.PROP_UPDATING_UUID_SRC) except Exception as e: raise LinstorVolumeManagerError( 'Failed to clear volume properties ' @@ -743,7 +729,7 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): 'UUID update succeeded of {} to {}! (properties={})' .format( volume_uuid, new_volume_uuid, - self._get_filtered_properties(new_volume_properties) + self._get_filtered_properties(volume_properties) ) ) @@ -788,6 +774,63 @@ def get_usage_states(self, volume_uuid): return states + def get_volumes_with_name(self): + """ + Give a volume dictionnary that contains names actually owned. + :return: A volume/name dict. + :rtype: dict(str, str) + """ + return self._get_volumes_by_property(self.REG_VOLUME_NAME) + + def get_volumes_with_info(self): + """ + Give a volume dictionnary that contains VolumeInfos. + :return: A volume/VolumeInfo dict. 
+ :rtype: dict(str, VolumeInfo) + """ + + volumes = {} + + all_volume_info = self._get_volumes_info() + volume_names = self.get_volumes_with_name() + for volume_uuid, volume_name in volume_names.items(): + if volume_name: + volume_info = all_volume_info.get(volume_name) + if volume_info: + volumes[volume_uuid] = volume_info + continue + + # Well I suppose if this volume is not available, + # LINSTOR has been used directly without using this API. + volumes[volume_uuid] = self.VolumeInfo('') + + return volumes + + def get_volumes_with_metadata(self): + """ + Give a volume dictionnary that contains metadata. + :return: A volume/metadata dict. + :rtype: dict(str, dict) + """ + + volumes = {} + + metadata = self._get_volumes_by_property(self.REG_METADATA) + for volume_uuid, volume_metadata in metadata.items(): + if volume_metadata: + volume_metadata = json.loads(volume_metadata) + if isinstance(volume_metadata, dict): + volumes[volume_uuid] = volume_metadata + continue + raise LinstorVolumeManagerError( + 'Expected dictionary in volume metadata: {}' + .format(volume_uuid) + ) + + volumes[volume_uuid] = {} + + return volumes + def get_volume_metadata(self, volume_uuid): """ Get the metadata of a volume. @@ -918,9 +961,9 @@ def find_best_nodes(): )) # 5. Create resources! 
- def clean(properties): + def clean(): try: - self._destroy_volume(clone_uuid, properties) + self._destroy_volume(clone_uuid) except Exception as e: self._logger( 'Unable to destroy volume {} after shallow clone fail: {}' @@ -946,7 +989,7 @@ def create(): ) return volume_properties except Exception: - clean(volume_properties) + clean() raise # Retry because we can get errors like this: @@ -962,7 +1005,7 @@ def create(): self._volumes.add(clone_uuid) return device_path except Exception as e: - clean(volume_properties) + clean() raise def remove_resourceless_volumes(self): @@ -974,7 +1017,7 @@ def remove_resourceless_volumes(self): """ resource_names = self._fetch_resource_names() - for volume_uuid, volume_name in self.volumes_with_name.items(): + for volume_uuid, volume_name in self.get_volumes_with_name().items(): if not volume_name or volume_name not in resource_names: self.destroy_volume(volume_uuid) @@ -992,11 +1035,7 @@ def destroy(self, force=False): # TODO: What's the required action if it exists remaining volumes? 
self._destroy_resource_group(self._linstor, self._group_name) - - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - for pool in pools: + for pool in self._get_storage_pools(force=True): self._destroy_storage_pool( self._linstor, pool.name, pool.node_name ) @@ -1014,10 +1053,13 @@ def find_up_to_date_diskfull_nodes(self, volume_uuid): in_use = False node_names = set() - resource_list = self._linstor.resource_list_raise( - filter_by_resources=[volume_name] + + resource_states = filter( + lambda resource_state: resource_state.name == volume_name, + self._get_resource_cache().resource_states ) - for resource_state in resource_list.resource_states: + + for resource_state in resource_states: volume_state = resource_state.volume_states[0] if volume_state.disk_state == 'UpToDate': node_names.add(resource_state.node_name) @@ -1026,6 +1068,14 @@ def find_up_to_date_diskfull_nodes(self, volume_uuid): return (node_names, in_use) + def invalidate_resource_cache(self): + """ + If resources are impacted by external commands like vhdutil, + it's necessary to call this function to invalidate current resource + cache. + """ + self._mark_resource_cache_as_dirty() + @classmethod def create_sr( cls, uri, group_name, node_names, redundancy, @@ -1149,6 +1199,12 @@ def create_sr( instance._redundancy = redundancy instance._group_name = group_name instance._volumes = set() + instance._storage_pools_time = 0 + instance._kv_cache = instance._create_kv_cache() + instance._resource_cache = None + instance._resource_cache_dirty = True + instance._volume_info_cache = None + instance._volume_info_cache_dirty = True return instance @classmethod @@ -1196,6 +1252,32 @@ def round_down_volume_size(cls, volume_size): # Private helpers. 
# -------------------------------------------------------------------------- + def _create_kv_cache(self): + self._kv_cache = self._create_linstor_kv('/') + self._kv_cache_dirty = False + return self._kv_cache + + def _get_kv_cache(self): + if self._kv_cache_dirty: + self._kv_cache = self._create_kv_cache() + return self._kv_cache + + def _create_resource_cache(self): + self._resource_cache = self._linstor.resource_list_raise() + self._resource_cache_dirty = False + return self._resource_cache + + def _get_resource_cache(self): + if self._resource_cache_dirty: + self._resource_cache = self._create_resource_cache() + return self._resource_cache + + def _mark_resource_cache_as_dirty(self): + self._resource_cache_dirty = True + self._volume_info_cache_dirty = True + + # -------------------------------------------------------------------------- + def _ensure_volume_exists(self, volume_uuid): if volume_uuid not in self._volumes: raise LinstorVolumeManagerError( @@ -1224,12 +1306,13 @@ def _fetch_resource_names(self): resource_names.add(dfn.name) return resource_names - def _get_volumes_info(self, filter=None): + def _get_volumes_info(self, volume_name=None): all_volume_info = {} - resources = self._linstor.resource_list_raise( - filter_by_resources=filter - ) - for resource in resources.resources: + + if not self._volume_info_cache_dirty: + return self._volume_info_cache + + for resource in self._get_resource_cache().resources: if resource.name not in all_volume_info: current = all_volume_info[resource.name] = self.VolumeInfo( resource.name @@ -1261,6 +1344,9 @@ def _get_volumes_info(self, filter=None): current.physical_size *= 1024 current.virtual_size *= 1024 + self._volume_info_cache_dirty = False + self._volume_info_cache = all_volume_info + return all_volume_info def _get_volume_node_names_and_size(self, volume_name): @@ -1289,12 +1375,8 @@ def _get_volume_node_names_and_size(self, volume_name): return (node_names, size * 1024) def _compute_size(self, attr): - 
pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - capacity = 0 - for pool in pools: + for pool in self._get_storage_pools(force=True): space = pool.free_space if space: size = getattr(space, attr) @@ -1308,13 +1390,22 @@ def _compute_size(self, attr): def _get_node_names(self): node_names = set() - pools = self._linstor.storage_pool_list_raise( - filter_by_stor_pools=[self._group_name] - ).storage_pools - for pool in pools: + for pool in self._get_storage_pools(): node_names.add(pool.node_name) return node_names + def _get_storage_pools(self, force=False): + cur_time = time.time() + elsaped_time = cur_time - self._storage_pools_time + + if force or elsaped_time >= self.STORAGE_POOLS_FETCH_INTERVAL: + self._storage_pools = self._linstor.storage_pool_list_raise( + filter_by_stor_pools=[self._group_name] + ).storage_pools + self._storage_pools_time = time.time() + + return self._storage_pools + def _check_volume_creation_errors(self, result, volume_uuid): errors = self._filter_errors(result) if self._check_errors(errors, [ @@ -1338,6 +1429,7 @@ def _check_volume_creation_errors(self, result, volume_uuid): def _create_volume(self, volume_uuid, volume_name, size, place_resources): size = self.round_up_volume_size(size) + self._mark_resource_cache_as_dirty() self._check_volume_creation_errors(self._linstor.resource_group_spawn( rsc_grp_name=self._group_name, rsc_dfn_name=volume_name, @@ -1378,6 +1470,8 @@ def _create_volume_with_properties( volume_uuid, volume_name, size, place_resources ) + assert volume_properties.namespace == \ + self._build_volume_namespace(volume_uuid) return volume_properties except LinstorVolumeManagerError as e: # Do not destroy existing resource! @@ -1387,10 +1481,10 @@ def _create_volume_with_properties( # call in another host. 
if e.code == LinstorVolumeManagerError.ERR_VOLUME_EXISTS: raise - self._force_destroy_volume(volume_uuid, volume_properties) + self._force_destroy_volume(volume_uuid) raise except Exception: - self._force_destroy_volume(volume_uuid, volume_properties) + self._force_destroy_volume(volume_uuid) raise def _find_device_path(self, volume_uuid, volume_name): @@ -1417,34 +1511,26 @@ def _find_device_path(self, volume_uuid, volume_name): def _request_device_path(self, volume_uuid, volume_name, activate=False): node_name = socket.gethostname() - resources = self._linstor.resource_list( - filter_by_nodes=[node_name], - filter_by_resources=[volume_name] + + resources = filter( + lambda resource: resource.node_name == node_name and + resource.name == volume_name, + self._get_resource_cache().resources ) - if not resources or not resources[0]: + if not resources: + if activate: + self._activate_device_path(node_name, volume_name) + return self._request_device_path(volume_uuid, volume_name) raise LinstorVolumeManagerError( - 'No response list for dev path of `{}`'.format(volume_uuid) - ) - if isinstance(resources[0], linstor.responses.ResourceResponse): - if not resources[0].resources: - if activate: - self._activate_device_path(node_name, volume_name) - return self._request_device_path(volume_uuid, volume_name) - raise LinstorVolumeManagerError( - 'Empty dev path for `{}`, but definition "seems" to exist' - .format(volume_uuid) - ) - # Contains a path of the /dev/drbd form. - return resources[0].resources[0].volumes[0].device_path - - raise LinstorVolumeManagerError( - 'Unable to get volume dev path `{}`: {}'.format( - volume_uuid, str(resources[0]) + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(volume_uuid) ) - ) + # Contains a path of the /dev/drbd form. 
+ return resources[0].volumes[0].device_path def _activate_device_path(self, node_name, volume_name): + self._mark_resource_cache_as_dirty() result = self._linstor.resource_create([ linstor.ResourceData(node_name, volume_name, diskless=True) ]) @@ -1463,6 +1549,7 @@ def _activate_device_path(self, node_name, volume_name): ) def _destroy_resource(self, resource_name): + self._mark_resource_cache_as_dirty() result = self._linstor.resource_dfn_delete(resource_name) error_str = self._get_error_str(result) if error_str: @@ -1471,10 +1558,8 @@ def _destroy_resource(self, resource_name): .format(resource_name, self._group_name, error_str) ) - def _destroy_volume(self, volume_uuid, volume_properties): - assert volume_properties.namespace == \ - self._build_volume_namespace(volume_uuid) - + def _destroy_volume(self, volume_uuid): + volume_properties = self._get_volume_properties(volume_uuid) try: volume_name = volume_properties.get(self.PROP_VOLUME_NAME) if volume_name in self._fetch_resource_names(): @@ -1487,19 +1572,14 @@ def _destroy_volume(self, volume_uuid, volume_properties): 'Cannot destroy volume `{}`: {}'.format(volume_uuid, e) ) - def _force_destroy_volume(self, volume_uuid, volume_properties): + def _force_destroy_volume(self, volume_uuid): try: - self._destroy_volume(volume_uuid, volume_properties) + self._destroy_volume(volume_uuid) except Exception as e: self._logger('Ignore fail: {}'.format(e)) def _build_volumes(self, repair): - properties = linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_volume_namespace() - ) - + properties = self._kv_cache resource_names = self._fetch_resource_names() self._volumes = set() @@ -1517,9 +1597,7 @@ def _build_volumes(self, repair): self.REG_NOT_EXISTS, ignore_inexisting_volumes=False ) for volume_uuid, not_exists in existing_volumes.items(): - properties.namespace = self._build_volume_namespace( - volume_uuid - ) + properties.namespace = self._build_volume_namespace(volume_uuid) src_uuid = 
properties.get(self.PROP_UPDATING_UUID_SRC) if src_uuid: @@ -1580,36 +1658,31 @@ def _build_volumes(self, repair): ) for dest_uuid, src_uuid in updating_uuid_volumes.items(): - dest_properties = self._get_volume_properties(dest_uuid) - if int(dest_properties.get(self.PROP_NOT_EXISTS) or - self.STATE_EXISTS): - dest_properties.clear() + dest_namespace = self._build_volume_namespace(dest_uuid) + + properties.namespace = dest_namespace + if int(properties.get(self.PROP_NOT_EXISTS)): + properties.clear() continue - src_properties = self._get_volume_properties(src_uuid) - src_properties.clear() + properties.namespace = self._build_volume_namespace(src_uuid) + properties.clear() - dest_properties.pop(self.PROP_UPDATING_UUID_SRC) + properties.namespace = dest_namespace + properties.pop(self.PROP_UPDATING_UUID_SRC) if src_uuid in self._volumes: self._volumes.remove(src_uuid) self._volumes.add(dest_uuid) def _get_sr_properties(self): - return linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_sr_namespace() - ) + return self._create_linstor_kv(self._build_sr_namespace()) def _get_volumes_by_property( self, reg_prop, ignore_inexisting_volumes=True ): - base_properties = linstor.KV( - self._get_store_name(), - uri=self._uri, - namespace=self._build_volume_namespace() - ) + base_properties = self._get_kv_cache() + base_properties.namespace = self._build_volume_namespace() volume_properties = {} for volume_uuid in self._volumes: @@ -1625,13 +1698,18 @@ def _get_volumes_by_property( return volume_properties - def _get_volume_properties(self, volume_uuid): + def _create_linstor_kv(self, namespace): return linstor.KV( self._get_store_name(), uri=self._uri, - namespace=self._build_volume_namespace(volume_uuid) + namespace=namespace ) + def _get_volume_properties(self, volume_uuid): + properties = self._get_kv_cache() + properties.namespace = self._build_volume_namespace(volume_uuid) + return properties + def _get_store_name(self): return 
'xcp-sr-{}'.format(self._group_name) From df211428026aed435b23c51baf3915846523b061 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 7 Jan 2021 11:17:08 +0100 Subject: [PATCH 024/133] feat(LinstorSR): robustify scan to avoid losing VDIs if function is called outside module Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 548f4b17..52131a59 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -325,6 +325,7 @@ def load(self, sr_uuid): self._initialized = False + self._vdis_loaded = False self._all_volume_info_cache = None self._all_volume_metadata_cache = None @@ -463,19 +464,13 @@ def wrap(self, *args, **kwargs): ) if load_vdis: - # We use a cache to avoid repeated JSON parsing. - # The performance gain is not big but we can still - # enjoy it with a few lines. - self._create_linstor_cache() self._load_vdis() - self._destroy_linstor_cache() - self._undo_all_journal_transactions() self._linstor.remove_resourceless_volumes() self._synchronize_metadata() except Exception as e: - if self.cmd == 'sr_scan': + if self.cmd == 'sr_scan' or self.cmd == 'sr_attach': # Always raise, we don't want to remove VDIs # from the XAPI database otherwise. raise e @@ -612,6 +607,9 @@ def scan(self, uuid): opterr='no such volume group: {}'.format(self._group_name) ) + # Note: `scan` can be called outside this module, so ensure the VDIs + # are loaded. + self._load_vdis() self._update_physical_size() for vdi_uuid in self.vdis.keys(): @@ -799,9 +797,22 @@ def _update_physical_size(self): # -------------------------------------------------------------------------- def _load_vdis(self): - if self.vdis: + if self._vdis_loaded: return + self._vdis_loaded = True + + assert self._is_master + + # We use a cache to avoid repeated JSON parsing. + # The performance gain is not big but we can still + # enjoy it with a few lines. 
+ self._create_linstor_cache() + self._load_vdis_ex() + self._destroy_linstor_cache() + + self._undo_all_journal_transactions() + def _load_vdis_ex(self): # 1. Get existing VDIs in XAPI. xenapi = self.session.xenapi xapi_vdi_uuids = set() @@ -822,7 +833,8 @@ def _load_vdis(self): introduce = False - if self.cmd == 'sr_scan': + # Try to introduce VDIs only during scan/attach. + if self.cmd == 'sr_scan' or self.cmd == 'sr_attach': has_clone_entries = list(self._journaler.get_all( LinstorJournaler.CLONE ).items()) From 7f6b21fab7e8931723703dd67f1f02a97291d8ba Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 8 Jan 2021 16:12:15 +0100 Subject: [PATCH 025/133] feat(LinstorSR): display a correctly readable size for the user Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 46 +++++++++------------ drivers/linstorvolumemanager.py | 72 +++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 35 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 52131a59..16cb0d62 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -770,27 +770,19 @@ def _update_stats(self, virt_alloc_delta): # Update size attributes of the SR parent class. self.virtual_allocation = valloc + virt_alloc_delta - # Physical size contains the total physical size. - # i.e. the sum of the sizes of all devices on all hosts, not the AVG. self._update_physical_size() # Notify SR parent class. self._db_update() def _update_physical_size(self): - # Physical size contains the total physical size. - # i.e. the sum of the sizes of all devices on all hosts, not the AVG. - self.physical_size = self._linstor.physical_size + # We use the size of the smallest disk, this is an approximation that + # ensures the displayed physical size is reachable by the user. + self.physical_size = \ + self._linstor.min_physical_size * len(self._hosts) / \ + self._redundancy - # `self._linstor.physical_free_size` contains the total physical free - # memory. 
If Thin provisioning is used we can't use it, we must use - # LINSTOR volume size to gives a good idea of the required - # usable memory to the users. - self.physical_utilisation = self._linstor.total_allocated_volume_size - - # If Thick provisioning is used, we can use this line instead: - # self.physical_utilisation = \ - # self.physical_size - self._linstor.physical_free_size + self.physical_utilisation = self._linstor.allocated_volume_size # -------------------------------------------------------------------------- # VDIs. @@ -912,10 +904,10 @@ def _load_vdis_ex(self): util.SMlog( 'Introducing VDI {} '.format(vdi_uuid) + - ' (name={}, virtual_size={}, physical_size={})'.format( + ' (name={}, virtual_size={}, allocated_size={})'.format( name_label, volume_info.virtual_size, - volume_info.physical_size + volume_info.allocated_size ) ) @@ -933,7 +925,7 @@ def _load_vdis_ex(self): sm_config, managed, str(volume_info.virtual_size), - str(volume_info.physical_size) + str(volume_info.allocated_size) ) is_a_snapshot = volume_metadata.get(IS_A_SNAPSHOT_TAG) @@ -1016,7 +1008,7 @@ def _load_vdis_ex(self): else: geneology[vdi.parent] = [vdi_uuid] if not vdi.hidden: - self.virtual_allocation += vdi.utilisation + self.virtual_allocation += vdi.size # 9. Remove all hidden leaf nodes to avoid introducing records that # will be GC'ed. @@ -1453,11 +1445,11 @@ def create(self, sr_uuid, vdi_uuid, size): '{}'.format(e) ) - self.utilisation = volume_info.physical_size + self.utilisation = volume_info.allocated_size self.sm_config['vdi_type'] = self.vdi_type self.ref = self._db_introduce() - self.sr._update_stats(volume_info.virtual_size) + self.sr._update_stats(self.size) return VDI.VDI.get_params(self) @@ -1496,7 +1488,7 @@ def delete(self, sr_uuid, vdi_uuid, data_only=False): del self.sr.vdis[self.uuid] # TODO: Check size after delete. 
- self.sr._update_stats(-self.capacity) + self.sr._update_stats(-self.size) self.sr._kick_gc() return super(LinstorVDI, self).delete(sr_uuid, vdi_uuid, data_only) @@ -1622,7 +1614,7 @@ def resize(self, sr_uuid, vdi_uuid, size): space_needed = new_volume_size - old_volume_size self.sr._ensure_space_available(space_needed) - old_capacity = self.capacity + old_size = self.size if self.vdi_type == vhdutil.VDI_TYPE_RAW: self._linstor.resize(self.uuid, new_volume_size) else: @@ -1641,7 +1633,7 @@ def resize(self, sr_uuid, vdi_uuid, size): self.session.xenapi.VDI.set_physical_utilisation( vdi_ref, str(self.utilisation) ) - self.sr._update_stats(self.capacity - old_capacity) + self.sr._update_stats(self.size - old_size) return VDI.VDI.get_params(self) def clone(self, sr_uuid, vdi_uuid): @@ -1756,13 +1748,13 @@ def _load_this(self): if volume_info is None: volume_info = self._linstor.get_volume_info(self.uuid) - # Contains the physical size used on all disks. + # Contains the max physical size used on a disk. # When LINSTOR LVM driver is used, the size should be similar to # virtual size (i.e. the LINSTOR max volume size). # When LINSTOR Thin LVM driver is used, the used physical size should # be lower than virtual size at creation. # The physical size increases after each write in a new block. - self.utilisation = volume_info.physical_size + self.utilisation = volume_info.allocated_size self.capacity = volume_info.virtual_size if self.vdi_type == vhdutil.VDI_TYPE_RAW: @@ -1958,7 +1950,7 @@ def _create_snapshot(self, snap_uuid, snap_of_uuid=None): volume_info = self._linstor.get_volume_info(snap_uuid) snap_vdi.size = self.sr._vhdutil.get_size_virt(snap_uuid) - snap_vdi.utilisation = volume_info.physical_size + snap_vdi.utilisation = volume_info.allocated_size # 6. Update sm config. 
snap_vdi.sm_config = {} @@ -2156,7 +2148,7 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): raise if snap_type != VDI.SNAPSHOT_INTERNAL: - self.sr._update_stats(self.capacity) + self.sr._update_stats(self.size) # 10. Return info on the new user-visible leaf VDI. ret_vdi = snap_vdi diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index d617655f..a6f67d8d 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -131,20 +131,19 @@ def default_logger(*args): class VolumeInfo(object): __slots__ = ( 'name', - 'physical_size', # Total physical size used by this volume on - # all disks. + 'allocated_size', # Allocated size, place count is not used. 'virtual_size' # Total virtual available size of this volume # (i.e. the user size at creation). ) def __init__(self, name): self.name = name - self.physical_size = 0 + self.allocated_size = 0 self.virtual_size = 0 def __repr__(self): return 'VolumeInfo("{}", {}, {})'.format( - self.name, self.physical_size, self.virtual_size + self.name, self.allocated_size, self.virtual_size ) # -------------------------------------------------------------------------- @@ -248,9 +247,31 @@ def physical_free_size(self): return self._compute_size('free_capacity') @property - def total_allocated_volume_size(self): + def min_physical_size(self): """ - Give the sum of all created volumes. + Give the minimum physical size of the SR. + I.e. the size of the smallest disk. + :return: The physical min size. + :rtype: int + """ + size = None + for pool in self._get_storage_pools(force=True): + space = pool.free_space + if space: + current_size = space.total_capacity + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get pool total_capacity attr of `{}`' + .format(pool.node_name) + ) + if size is None or current_size < size: + size = current_size + return size * 1024 + + @property + def total_volume_size(self): + """ + Give the sum of all created volumes. 
The place count is used. :return: The physical required size to use the volumes. :rtype: int """ @@ -269,6 +290,37 @@ def total_allocated_volume_size(self): size += current_size return size * 1024 + @property + def allocated_volume_size(self): + """ + Give the allocated size for all volumes. The place count is not + used here. When thick lvm is used, the size for one volume should + be equal to the virtual volume size. With thin lvm, the size is equal + or lower to the volume size. + :return: The allocated size of all volumes. + :rtype: int + """ + + size = 0 + for resource in self._get_resource_cache().resources: + volume_size = None + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". + if volume.storage_pool_name == self._group_name: + current_size = volume.allocated_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get allocated size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + + if volume_size is None or current_size > volume_size: + volume_size = current_size + if volume_size is not None: + size += volume_size + + return size * 1024 + @property def metadata(self): """ @@ -1328,7 +1380,11 @@ def _get_volumes_info(self, volume_name=None): 'Failed to get allocated size of `{}` on `{}`' .format(resource.name, volume.storage_pool_name) ) - current.physical_size += volume.allocated_size + allocated_size = volume.allocated_size + + current.allocated_size = current.allocated_size and \ + max(current.allocated_size, allocated_size) or \ + allocated_size if volume.usable_size < 0: raise LinstorVolumeManagerError( @@ -1341,7 +1397,7 @@ def _get_volumes_info(self, volume_name=None): min(current.virtual_size, virtual_size) or virtual_size for current in all_volume_info.values(): - current.physical_size *= 1024 + current.allocated_size *= 1024 current.virtual_size *= 1024 self._volume_info_cache_dirty = False From c6ecf4e3aea57f56c6c9730467da3f3fd5f3c719 Mon Sep 17 00:00:00 
2001 From: Ronan Abhamon Date: Tue, 12 Jan 2021 14:06:34 +0100 Subject: [PATCH 026/133] feat(linstor-monitord): scan all LINSTOR SRs every 12 minutes to update allocated size stats Signed-off-by: Ronan Abhamon --- linstor/linstor-monitord.c | 194 ++++++++++++++++++++++++++++++++----- 1 file changed, 171 insertions(+), 23 deletions(-) diff --git a/linstor/linstor-monitord.c b/linstor/linstor-monitord.c index 8161813d..a1592fda 100644 --- a/linstor/linstor-monitord.c +++ b/linstor/linstor-monitord.c @@ -14,8 +14,10 @@ * along with this program. If not, see . */ +#include #include #include +#include #include #include #include @@ -39,7 +41,8 @@ #define POOL_CONF_ABS_FILE POOL_CONF_DIR "/" POOL_CONF_FILE // In milliseconds. -#define POLL_TIMEOUT 2000 +#define UPDATE_LINSTOR_NODE_TIMEOUT 2000 +#define SR_SCAN_TIMEOUT 720000 // ----------------------------------------------------------------------------- @@ -130,24 +133,120 @@ static inline int isMasterHost (int *error) { typedef struct { int inotifyFd; + struct timespec lastScanTime; + int isMaster; // TODO: Should be completed with at least a hostname field. } State; // ----------------------------------------------------------------------------- -static inline int execCommand (char *argv[]) { +typedef struct { + char *data; + size_t size; + size_t capacity; +} Buffer; + +#define max(a, b) ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? 
_a : _b; \ +}) + +static inline ssize_t readAll (int fd, Buffer *buffer) { + assert(buffer->capacity >= buffer->size); + + ssize_t ret = 0; + do { + size_t byteCount = buffer->capacity - buffer->size; + if (byteCount < 16) { + const size_t newCapacity = max(buffer->capacity << 1, 64); + char *p = realloc(buffer->data, newCapacity); + if (!p) + return -errno; + + buffer->data = p; + buffer->capacity = newCapacity; + + byteCount = buffer->capacity - buffer->size; + } + + ret = read(fd, buffer->data + buffer->size, byteCount); + if (ret > 0) + buffer->size += ret; + else if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + ret = 0; + } while (ret > 0); + + return ret; +} + +// ----------------------------------------------------------------------------- + +static inline int execCommand (char *argv[], Buffer *buffer) { + int pipefd[2]; + if (buffer) { + if (pipe(pipefd) < 0) { + syslog(LOG_ERR, "Failed to exec pipe: `%s`.", strerror(errno)); + return -errno; + } + + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0) { + syslog(LOG_ERR, "Failed to exec fcntl on pipe in: `%s`.", strerror(errno)); + close(pipefd[0]); + close(pipefd[1]); + return -errno; + } + } + const pid_t pid = fork(); - if (pid < 0) + if (pid < 0) { + syslog(LOG_ERR, "Failed to fork: `%s`.", strerror(errno)); + if (buffer) { + close(pipefd[0]); + close(pipefd[1]); + } return -errno; + } // Child process. if (pid == 0) { + if (buffer) { + close(STDOUT_FILENO); + dup(pipefd[1]); + + close(pipefd[0]); + close(pipefd[1]); + } + if (execvp(*argv, argv) < 0) syslog(LOG_ERR, "Failed to exec `%s` command.", *argv); exit(EXIT_FAILURE); } // Main process. 
+ int ret = 0; + if (buffer) { + close(pipefd[1]); + + do { + struct pollfd fds = { pipefd[0], POLLIN | POLLHUP, 0 }; + const int res = poll(&fds, 1, 0); + if (res < 0) { + if (errno == EAGAIN) + continue; + syslog(LOG_ERR, "Failed to poll from command: `%s`.", strerror(errno)); + ret = -errno; + } else if (res > 0) { + if (fds.revents & POLLIN) + ret = readAll(pipefd[0], buffer); + if (fds.revents & POLLHUP) + break; // Input has been closed. + } + } while (ret >= 0); + + close(pipefd[0]); + } + int status; if (waitpid(pid, &status, 0) < 0) { syslog(LOG_ERR, "Failed to wait command: `%s`.", *argv); @@ -163,7 +262,7 @@ static inline int execCommand (char *argv[]) { } else if (WIFSIGNALED(status)) syslog(LOG_ERR, "`%s` terminated by signal %d.", *argv, WTERMSIG(status)); - return 0; + return ret; } // ----------------------------------------------------------------------------- @@ -188,12 +287,7 @@ static inline int addInotifyWatch (int inotifyFd, const char *filepath, uint32_t // ----------------------------------------------------------------------------- -static inline int updateLinstorServices () { - int error; - const int isMaster = isMasterHost(&error); - if (error) - return error; - +static inline int updateLinstorController (int isMaster) { syslog(LOG_INFO, "%s linstor-controller...", isMaster ? 
"Enabling" : "Disabling"); char *argv[] = { "systemctl", @@ -202,7 +296,7 @@ static inline int updateLinstorServices () { "linstor-controller", NULL }; - return execCommand(argv); + return execCommand(argv, NULL); } static inline int updateLinstorNode (State *state) { @@ -219,14 +313,53 @@ static inline int updateLinstorNode (State *state) { // ----------------------------------------------------------------------------- +#define UUID_PARAM "uuid=" +#define UUID_PARAM_LEN (sizeof(UUID_PARAM) - 1) +#define UUID_LENGTH 36 + +static inline void scanLinstorSr (const char *uuid) { + char uuidBuf[UUID_LENGTH + UUID_PARAM_LEN + 1] = UUID_PARAM; + strncpy(uuidBuf + UUID_PARAM_LEN, uuid, UUID_LENGTH); + uuidBuf[UUID_LENGTH + UUID_PARAM_LEN] = '\0'; + execCommand((char *[]){ "xe", "sr-scan", uuidBuf, NULL }, NULL); +} + +// Called to update the physical/virtual size used by LINSTOR SRs in XAPI DB. +static inline int scanLinstorSrs () { + Buffer srs = {}; + const int ret = execCommand((char *[]){ "xe", "sr-list", "type=linstor", "--minimal", NULL }, &srs); + if (ret) { + free(srs.data); + return ret; + } + + const char *end = srs.data + srs.size; + char *pos = srs.data; + for (char *off; (off = memchr(pos, ',', end - pos)); pos = off + 1) + if (off - pos == UUID_LENGTH) + scanLinstorSr(pos); + + if (end - pos >= UUID_LENGTH) { + for (--end; end - pos >= UUID_LENGTH && isspace(*end); --end) {} + if (isalnum(*end)) + scanLinstorSr(pos); + } + + free(srs.data); + + return 0; +} + +// ----------------------------------------------------------------------------- + #define PROCESS_MODE_DEFAULT 0 #define PROCESS_MODE_WAIT_FILE_CREATION 1 static inline int waitForPoolConfCreation (State *state, int *wdFile); -static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, size_t *bufferSize, int mode, int *process) { +static inline int processPoolConfEvents (State *state, int wd, char **buffer, size_t *bufferSize, int mode, int *process) { size_t size = 0; - if 
(ioctl(inotifyFd, FIONREAD, (char *)&size) == -1) { + if (ioctl(state->inotifyFd, FIONREAD, (char *)&size) == -1) { syslog(LOG_ERR, "Failed to get buffer size from inotify descriptor: `%s`.", strerror(errno)); return -errno; } @@ -241,7 +374,7 @@ static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, s *bufferSize = size; } - if ((size = (size_t)read(inotifyFd, *buffer, size)) == (size_t)-1) { + if ((size = (size_t)read(state->inotifyFd, *buffer, size)) == (size_t)-1) { syslog(LOG_ERR, "Failed to read buffer from inotify descriptor: `%s`.", strerror(errno)); return -errno; } @@ -280,10 +413,10 @@ static inline int processPoolConfEvents (int inotifyFd, int wd, char **buffer, s syslog(LOG_INFO, "Updating linstor services... (Inotify mask=%" PRIu32 ")", mask); if (mask & (IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT)) { syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been removed!"); - inotify_rm_watch(inotifyFd, wd); // Do not forget to remove watch to avoid leaks. + inotify_rm_watch(state->inotifyFd, wd); // Do not forget to remove watch to avoid leaks. 
return -EIO; } - ret = updateLinstorServices(); + ret = updateLinstorController(state->isMaster); } else { if (mask & (IN_CREATE | IN_MOVED_TO)) { syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been recreated!"); @@ -303,16 +436,24 @@ static inline int waitAndProcessEvents (State *state, int wd, int mode) { struct timespec previousTime = getCurrentTime(); do { - struct timespec currentTime = getCurrentTime(); + const struct timespec currentTime = getCurrentTime(); const int64_t elapsedTime = convertToMilliseconds(getTimeDiff(&currentTime, &previousTime)); int timeout; - if (elapsedTime >= POLL_TIMEOUT) { + if (elapsedTime >= UPDATE_LINSTOR_NODE_TIMEOUT) { updateLinstorNode(state); - timeout = POLL_TIMEOUT; + timeout = UPDATE_LINSTOR_NODE_TIMEOUT; previousTime = getCurrentTime(); } else { - timeout = POLL_TIMEOUT - elapsedTime; + timeout = UPDATE_LINSTOR_NODE_TIMEOUT - elapsedTime; + } + + const int64_t elapsedScanTime = convertToMilliseconds(getTimeDiff(&currentTime, &state->lastScanTime)); + if (elapsedScanTime >= SR_SCAN_TIMEOUT) { + state->isMaster = isMasterHost(&ret); + if (state->isMaster) + scanLinstorSrs(); + state->lastScanTime = getCurrentTime(); } struct pollfd fds = { state->inotifyFd, POLLIN, 0 }; @@ -323,7 +464,9 @@ static inline int waitAndProcessEvents (State *state, int wd, int mode) { syslog(LOG_ERR, "Failed to poll from inotify descriptor: `%s`.", strerror(errno)); ret = -errno; } else if (res > 0) { - ret = processPoolConfEvents(state->inotifyFd, wd, &buffer, &bufferSize, mode, &process); + state->isMaster = isMasterHost(&ret); + if (!ret) + ret = processPoolConfEvents(state, wd, &buffer, &bufferSize, mode, &process); } } while (ret >= 0 && process); @@ -350,7 +493,10 @@ static inline int waitForPoolConfCreation (State *state, int *wdFile) { do { do { // Update LINSTOR services... 
- ret = updateLinstorServices(); + int ret; + state->isMaster = isMasterHost(&ret); + if (!ret) + ret = updateLinstorController(state->isMaster); // Ok we can't read the pool configuration file. // Maybe the file doesn't exist. Waiting its creation... @@ -378,7 +524,9 @@ int main (int argc, char *argv[]) { setlogmask(LOG_UPTO(LOG_INFO)); State state = { - .inotifyFd = -1 + .inotifyFd = -1, + .lastScanTime = getCurrentTime(), + .isMaster = 0 }; const int inotifyFd = createInotifyInstance(); From 2b9100727860fdea05883a914d9b33f3094b5edb Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 15 Jan 2021 17:01:05 +0100 Subject: [PATCH 027/133] fix(LinstorSR): call correctly method in _locked_load when vdi_attach_from_config is executed Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 16cb0d62..2df2d681 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -347,19 +347,12 @@ def wrap(self, *args, **kwargs): self._master_uri, self._group_name, logger=util.SMlog ) - try: - self._linstor = LinstorVolumeManager( - self._master_uri, - self._group_name, - logger=util.SMlog - ) - return - except Exception as e: - util.SMlog( - 'Ignore exception. 
Failed to build LINSTOR ' - 'instance without session: {}'.format(e) - ) - return + self._linstor = LinstorVolumeManager( + self._master_uri, + self._group_name, + logger=util.SMlog + ) + return method(self, *args, **kwargs) self._master_uri = 'linstor://{}'.format( util.get_master_rec(self.session)['address'] From 38e0e36bdd3a3877cae36606837c6862056c8d9a Mon Sep 17 00:00:00 2001 From: Wescoeur Date: Wed, 20 Jan 2021 18:04:26 +0100 Subject: [PATCH 028/133] feat(LinstorSR): integrate minidrbdcluster daemon Now, we can: - Start a controller on any node - Share the LINSTOR volume list using a specific volume "xcp-persistent-database" - Use the HA with "xcp-persistent-ha-statefile" and "xcp-persistent-redo-log" volumes - Create the nodes automatically during SR creation Signed-off-by: Ronan Abhamon --- Makefile | 14 + drivers/LinstorSR.py | 269 ++++--- drivers/cleanup.py | 12 +- drivers/linstor-manager | 111 ++- drivers/linstorjournaler.py | 40 +- drivers/linstorvolumemanager.py | 691 +++++++++++++++--- drivers/tapdisk-pause | 8 +- drivers/util.py | 21 - etc/minidrbdcluster.ini | 14 + .../linstor-satellite.service.d/override.conf | 5 + etc/systemd/system/var-lib-linstor.mount | 6 + linstor/linstor-monitord.c | 15 - scripts/minidrbdcluster | 171 +++++ systemd/minidrbdcluster.service | 18 + 14 files changed, 1124 insertions(+), 271 deletions(-) create mode 100644 etc/minidrbdcluster.ini create mode 100644 etc/systemd/system/linstor-satellite.service.d/override.conf create mode 100644 etc/systemd/system/var-lib-linstor.mount create mode 100755 scripts/minidrbdcluster create mode 100644 systemd/minidrbdcluster.service diff --git a/Makefile b/Makefile index 3357cbfb..a8d9253c 100755 --- a/Makefile +++ b/Makefile @@ -91,6 +91,7 @@ PLUGIN_SCRIPT_DEST := /etc/xapi.d/plugins/ LIBEXEC := /opt/xensource/libexec/ UDEV_RULES_DIR := /etc/udev/rules.d/ UDEV_SCRIPTS_DIR := /etc/udev/scripts/ +SYSTEMD_CONF_DIR := /etc/systemd/system/ SYSTEMD_SERVICE_DIR := /usr/lib/systemd/system/ 
INIT_DIR := /etc/rc.d/init.d/ MPATH_CONF_DIR := /etc/multipath.xenserver/ @@ -98,6 +99,7 @@ MPATH_CUSTOM_CONF_DIR := /etc/multipath/conf.d/ MODPROBE_DIR := /etc/modprobe.d/ EXTENSION_SCRIPT_DEST := /etc/xapi.d/extensions/ LOGROTATE_DIR := /etc/logrotate.d/ +MINI_DRBD_CLUSTER_CONF_DIR := /etc/ SM_STAGING := $(DESTDIR) SM_STAMP := $(MY_OBJ_DIR)/.staging_stamp @@ -146,11 +148,14 @@ install: precheck mkdir -p $(SM_STAGING)$(UDEV_RULES_DIR) mkdir -p $(SM_STAGING)$(UDEV_SCRIPTS_DIR) mkdir -p $(SM_STAGING)$(INIT_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d mkdir -p $(SM_STAGING)$(SYSTEMD_SERVICE_DIR) mkdir -p $(SM_STAGING)$(MPATH_CONF_DIR) mkdir -p $(SM_STAGING)$(MPATH_CUSTOM_CONF_DIR) mkdir -p $(SM_STAGING)$(MODPROBE_DIR) mkdir -p $(SM_STAGING)$(LOGROTATE_DIR) + mkdir -p $(SM_STAGING)$(MINI_DRBD_CLUSTER_CONF_DIR) mkdir -p $(SM_STAGING)$(DEBUG_DEST) mkdir -p $(SM_STAGING)$(BIN_DEST) mkdir -p $(SM_STAGING)$(MASTER_SCRIPT_DEST) @@ -174,6 +179,12 @@ install: precheck $(SM_STAGING)/$(SM_DEST) install -m 644 etc/logrotate.d/$(SMLOG_CONF) \ $(SM_STAGING)/$(LOGROTATE_DIR) + install -m 644 etc/systemd/system/linstor-satellite.service.d/override.conf \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d/ + install -m 644 etc/systemd/system/var-lib-linstor.mount \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR) + install -m 644 etc/minidrbdcluster.ini \ + $(SM_STAGING)/$(MINI_DRBD_CLUSTER_CONF_DIR) install -m 644 etc/make-dummy-sr.service \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/xs-sm.service \ @@ -192,6 +203,8 @@ install: precheck $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/linstor-monitor.service \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) + install -m 644 systemd/minidrbdcluster.service \ + $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) for i in $(UDEV_RULES); do \ install -m 644 udev/$$i.rules \ $(SM_STAGING)$(UDEV_RULES_DIR); done @@ -241,6 +254,7 @@ install: precheck install -m 
755 scripts/xe-getlunidentifier $(SM_STAGING)$(BIN_DEST) install -m 755 scripts/make-dummy-sr $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/storage-init $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/minidrbdcluster $(SM_STAGING)$(LIBEXEC) .PHONY: clean clean: diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 2df2d681..9650d712 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -19,8 +19,11 @@ try: from linstorjournaler import LinstorJournaler from linstorvhdutil import LinstorVhdUtil - from linstorvolumemanager \ - import LinstorVolumeManager, LinstorVolumeManagerError + from linstorvolumemanager import get_controller_uri + from linstorvolumemanager import get_controller_node_name + from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import LinstorVolumeManagerError + LINSTOR_AVAILABLE = True except ImportError: LINSTOR_AVAILABLE = False @@ -310,7 +313,7 @@ def load(self, sr_uuid): self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) self.sr_vditype = SR.DEFAULT_TAP - self._hosts = self.dconf['hosts'].split(',') + self._hosts = list(set(self.dconf['hosts'].split(','))) self._redundancy = int(self.dconf['redundancy'] or 1) self._linstor = None # Ensure that LINSTOR attribute exists. self._journaler = None @@ -320,7 +323,6 @@ def load(self, sr_uuid): self._is_master = True self._group_name = self.dconf['group-name'] - self._master_uri = None self._vdi_shared_time = 0 self._initialized = False @@ -340,24 +342,18 @@ def wrap(self, *args, **kwargs): if self.srcmd.cmd == 'vdi_attach_from_config': # We must have a valid LINSTOR instance here without using # the XAPI. 
- self._master_uri = 'linstor://{}'.format( - util.get_master_address() - ) + controller_uri = get_controller_uri() self._journaler = LinstorJournaler( - self._master_uri, self._group_name, logger=util.SMlog + controller_uri, self._group_name, logger=util.SMlog ) self._linstor = LinstorVolumeManager( - self._master_uri, + controller_uri, self._group_name, logger=util.SMlog ) return method(self, *args, **kwargs) - self._master_uri = 'linstor://{}'.format( - util.get_master_rec(self.session)['address'] - ) - if not self._is_master: if self.cmd in [ 'sr_create', 'sr_delete', 'sr_update', 'sr_probe', @@ -376,43 +372,31 @@ def wrap(self, *args, **kwargs): self._shared_lock_vdi(self.srcmd.params['vdi_uuid']) self._vdi_shared_time = time.time() - self._journaler = LinstorJournaler( - self._master_uri, self._group_name, logger=util.SMlog - ) + if self.srcmd.cmd != 'sr_create' and self.srcmd.cmd != 'sr_detach': + try: + controller_uri = get_controller_uri() - # Ensure ports are opened and LINSTOR controller/satellite - # are activated. - if self.srcmd.cmd == 'sr_create': - # TODO: Disable if necessary - self._enable_linstor_on_all_hosts(status=True) + self._journaler = LinstorJournaler( + controller_uri, self._group_name, logger=util.SMlog + ) - try: - # Try to open SR if exists. - # We can repair only if we are on the master AND if - # we are trying to execute an exclusive operation. - # Otherwise we could try to delete a VDI being created or - # during a snapshot. An exclusive op is the guarantee that the - # SR is locked. - self._linstor = LinstorVolumeManager( - self._master_uri, - self._group_name, - repair=( - self._is_master and - self.srcmd.cmd in self.ops_exclusive - ), - logger=util.SMlog - ) - self._vhdutil = LinstorVhdUtil(self.session, self._linstor) - except Exception as e: - if self.srcmd.cmd == 'sr_create' or \ - self.srcmd.cmd == 'sr_detach': - # Ignore exception in this specific case: sr_create. 
- # At this moment the LinstorVolumeManager cannot be - # instantiated. Concerning the sr_detach command, we must - # ignore LINSTOR exceptions (if the volume group doesn't - # exist for example after a bad user action). - pass - else: + # Try to open SR if exists. + # We can repair only if we are on the master AND if + # we are trying to execute an exclusive operation. + # Otherwise we could try to delete a VDI being created or + # during a snapshot. An exclusive op is the guarantee that + # the SR is locked. + self._linstor = LinstorVolumeManager( + controller_uri, + self._group_name, + repair=( + self._is_master and + self.srcmd.cmd in self.ops_exclusive + ), + logger=util.SMlog + ) + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + except Exception as e: raise xs_errors.XenError('SRUnavailable', opterr=str(e)) if self._linstor: @@ -507,13 +491,44 @@ def create(self, uuid, size): opterr='group name must be unique' ) + if srs: + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='LINSTOR SR must be unique in a pool' + ) + + online_hosts = util.get_online_hosts(self.session) + if len(online_hosts) < len(self._hosts): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Not enough online hosts' + ) + + ips = {} + for host in online_hosts: + record = self.session.xenapi.host.get_record(host) + hostname = record['hostname'] + if hostname in self._hosts: + ips[hostname] = record['address'] + + if len(ips) != len(self._hosts): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Not enough online hosts' + ) + + # Ensure ports are opened and LINSTOR satellites + # are activated. In the same time the minidrbdcluster instances + # must be stopped. + self._prepare_sr_on_all_hosts(enabled=True) + # Create SR. # Throw if the SR already exists. 
try: self._linstor = LinstorVolumeManager.create_sr( - self._master_uri, self._group_name, self._hosts, + ips, self._redundancy, thin_provisioning=self._provisioning == 'thin', logger=util.SMlog @@ -523,30 +538,79 @@ def create(self, uuid, size): util.SMlog('Failed to create LINSTOR SR: {}'.format(e)) raise xs_errors.XenError('LinstorSRCreate', opterr=str(e)) + try: + util.SMlog( + "Finishing SR creation, enable minidrbdcluster on all hosts..." + ) + self._update_minidrbdcluster_on_all_hosts(enabled=True) + except Exception as e: + try: + self._linstor.destroy() + except Exception as e2: + util.SMlog( + 'Failed to destroy LINSTOR SR after creation fail: {}' + .format(e2) + ) + raise e + @_locked_load def delete(self, uuid): util.SMlog('LinstorSR.delete for {}'.format(self.uuid)) cleanup.gc_force(self.session, self.uuid) - if self.vdis: + if self.vdis or self._linstor._volumes: raise xs_errors.XenError('SRNotEmpty') - try: - # TODO: Use specific exceptions. If the LINSTOR group doesn't - # exist, we can remove it without problem. + node_name = get_controller_node_name() + if not node_name: + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr='Cannot get controller node name' + ) - # TODO: Maybe remove all volumes unused by the SMAPI. - # We must ensure it's a safe idea... 
+ host = None + if node_name == 'localhost': + host = util.get_this_host_ref(self.session) + else: + for slave in util.get_all_slaves(self.session): + r_name = self.session.xenapi.host.get_record(slave)['hostname'] + if r_name == node_name: + host = slave + break - self._linstor.destroy() - Lock.cleanupAll(self.uuid) + if not host: + raise xs_errors.XenError( + 'LinstorSRDelete', + opterr='Failed to find host with hostname: {}'.format( + node_name + ) + ) + + try: + self._update_minidrbdcluster_on_all_hosts(enabled=False) + + args = { + 'groupName': self._group_name, + } + self._exec_manager_command( + host, 'destroy', args, 'LinstorSRDelete' + ) except Exception as e: + try: + self._update_minidrbdcluster_on_all_hosts(enabled=True) + except Exception as e2: + util.SMlog( + 'Failed to restart minidrbdcluster after destroy fail: {}' + .format(e2) + ) util.SMlog('Failed to delete LINSTOR SR: {}'.format(e)) raise xs_errors.XenError( 'LinstorSRDelete', opterr=str(e) ) + Lock.cleanupAll(self.uuid) + @_locked_load def update(self, uuid): util.SMlog('LinstorSR.update for {}'.format(self.uuid)) @@ -626,10 +690,9 @@ def vdi(self, uuid): # -------------------------------------------------------------------------- def _shared_lock_vdi(self, vdi_uuid, locked=True): - pools = self.session.xenapi.pool.get_all() - master = self.session.xenapi.pool.get_master(pools[0]) + master = util.get_master_ref(self.session) - method = 'lockVdi' + command = 'lockVdi' args = { 'groupName': self._group_name, 'srUuid': self.uuid, @@ -654,48 +717,56 @@ def _shared_lock_vdi(self, vdi_uuid, locked=True): ) return - ret = self.session.xenapi.host.call_plugin( - master, self.MANAGER_PLUGIN, method, args - ) - util.SMlog( - 'call-plugin ({} with {}) returned: {}' - .format(method, args, ret) - ) - if ret == 'False': - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN) - ) + self._exec_manager_command(master, command, args, 'VDIUnavailable') # 
-------------------------------------------------------------------------- # Network. # -------------------------------------------------------------------------- - def _enable_linstor(self, host, status): - method = 'enable' - args = {'enabled': str(bool(status))} - + def _exec_manager_command(self, host, command, args, error): ret = self.session.xenapi.host.call_plugin( - host, self.MANAGER_PLUGIN, method, args + host, self.MANAGER_PLUGIN, command, args ) util.SMlog( - 'call-plugin ({} with {}) returned: {}'.format(method, args, ret) + 'call-plugin ({}:{} with {}) returned: {}'.format( + self.MANAGER_PLUGIN, command, args, ret + ) ) if ret == 'False': raise xs_errors.XenError( - 'SRUnavailable', + error, opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN) ) - def _enable_linstor_on_master(self, status): - pools = self.session.xenapi.pool.get_all() - master = self.session.xenapi.pool.get_master(pools[0]) - self._enable_linstor(master, status) + def _prepare_sr(self, host, enabled): + self._exec_manager_command( + host, + 'prepareSr' if enabled else 'releaseSr', + {}, + 'SRUnavailable' + ) + + def _prepare_sr_on_all_hosts(self, enabled): + master = util.get_master_ref(self.session) + self._prepare_sr(master, enabled) - def _enable_linstor_on_all_hosts(self, status): - self._enable_linstor_on_master(status) for slave in util.get_all_slaves(self.session): - self._enable_linstor(slave, status) + self._prepare_sr(slave, enabled) + + def _update_minidrbdcluster(self, host, enabled): + self._exec_manager_command( + host, + 'updateMinidrbdcluster', + {'enabled': str(enabled)}, + 'SRUnavailable' + ) + + def _update_minidrbdcluster_on_all_hosts(self, enabled): + master = util.get_master_ref(self.session) + self._update_minidrbdcluster(master, enabled) + + for slave in util.get_all_slaves(self.session): + self._update_minidrbdcluster(slave, enabled) # -------------------------------------------------------------------------- # Metadata. 
@@ -1384,8 +1455,15 @@ def create(self, sr_uuid, vdi_uuid, size): # 4. Create! failed = False try: + volume_name = None + if self.ty == 'ha_statefile': + volume_name = 'xcp-persistent-ha-statefile' + elif self.ty == 'redo_log': + volume_name = 'xcp-persistent-redo-log' + self._linstor.create_volume( - self.uuid, volume_size, persistent=False + self.uuid, volume_size, persistent=False, + volume_name=volume_name ) volume_info = self._linstor.get_volume_info(self.uuid) @@ -1822,25 +1900,14 @@ def _prepare_thin(self, attach): else: fn = 'attach' if attach else 'detach' - # We assume the first pool is always the one currently in use. - pools = self.session.xenapi.pool.get_all() - master = self.session.xenapi.pool.get_master(pools[0]) + master = util.get_master_ref(self.session) + args = { 'groupName': self.sr._group_name, 'srUuid': self.sr.uuid, 'vdiUuid': self.uuid } - ret = self.session.xenapi.host.call_plugin( - master, self.sr.MANAGER_PLUGIN, fn, args - ) - util.SMlog( - 'call-plugin ({} with {}) returned: {}'.format(fn, args, ret) - ) - if ret == 'False': - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Plugin {} failed'.format(self.sr.MANAGER_PLUGIN) - ) + self.sr._exec_manager_command(master, fn, args, 'VDIUnavailable') # Reload size attrs after inflate or deflate! 
self._load_this() diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 74c4de07..0399389e 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -52,8 +52,10 @@ try: from linstorjournaler import LinstorJournaler from linstorvhdutil import LinstorVhdUtil - from linstorvolumemanager \ - import LinstorVolumeManager, LinstorVolumeManagerError + from linstorvolumemanager import get_controller_uri + from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import LinstorVolumeManagerError + LINSTOR_AVAILABLE = True except ImportError: LINSTOR_AVAILABLE = False @@ -2868,7 +2870,6 @@ def __init__(self, uuid, xapi, createLock, force): ) SR.__init__(self, uuid, xapi, createLock, force) - self._master_uri = 'linstor://localhost' self.path = LinstorVolumeManager.DEV_ROOT_PATH self._reloadLinstor() @@ -2913,12 +2914,13 @@ def _reloadLinstor(self): dconf = session.xenapi.PBD.get_device_config(pbd) group_name = dconf['group-name'] + controller_uri = get_controller_uri() self.journaler = LinstorJournaler( - self._master_uri, group_name, logger=util.SMlog + controller_uri, group_name, logger=util.SMlog ) self._linstor = LinstorVolumeManager( - self._master_uri, + controller_uri, group_name, repair=True, logger=util.SMlog diff --git a/drivers/linstor-manager b/drivers/linstor-manager index e7e58fd8..f82b73f2 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -22,7 +22,7 @@ import XenAPIPlugin sys.path.append('/opt/xensource/sm/') from linstorjournaler import LinstorJournaler -from linstorvolumemanager import LinstorVolumeManager +from linstorvolumemanager import get_controller_uri, LinstorVolumeManager from lock import Lock import json import LinstorSR @@ -34,10 +34,6 @@ FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000'] -def get_linstor_uri(session): - return 'linstor://{}'.format(util.get_master_rec(session)['address']) - - def update_port(port, open): fn = 'open' if 
open else 'close' args = ( @@ -55,23 +51,72 @@ def update_all_ports(open): update_port(port, open) -def update_service(start): +def enable_and_start_service(name, start): fn = 'enable' if start else 'disable' - args = ('systemctl', fn, '--now', 'linstor-satellite') + args = ('systemctl', fn, '--now', name) (ret, out, err) = util.doexec(args) if ret == 0: return - raise Exception('Failed to {} satellite: {} {}'.format(fn, out, err)) + raise Exception('Failed to {} {}: {} {}'.format(fn, name, out, err)) + + +def restart_service(name): + args = ('systemctl', 'restart', name) + (ret, out, err) = util.doexec(args) + if ret == 0: + return + raise Exception('Failed to restart {}: {} {}'.format(name, out, err)) + + +def stop_service(name): + args = ('systemctl', 'stop', name) + (ret, out, err) = util.doexec(args) + if ret == 0: + return + raise Exception('Failed to stop {}: {} {}'.format(name, out, err)) + + +def update_linstor_satellite_service(start): + enable_and_start_service('linstor-satellite', start) + + +def update_minidrbdcluster_service(start): + enable_and_start_service('minidrbdcluster', start) + + +def prepare_sr(session, args): + try: + update_all_ports(open=True) + # We don't want to enable and start minidrbdcluster daemon during + # SR creation. 
+ update_minidrbdcluster_service(start=False) + update_linstor_satellite_service(start=True) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:prepare_sr error: {}'.format(e)) + return str(False) -def enable(session, args): +def release_sr(session, args): + try: + update_linstor_satellite_service(start=False) + update_minidrbdcluster_service(start=False) + update_all_ports(open=False) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:release_sr error: {}'.format(e)) + return str(False) + + +def update_minidrbdcluster(session, args): try: enabled = distutils.util.strtobool(args['enabled']) - update_all_ports(open=enabled) - update_service(start=enabled) + update_minidrbdcluster_service(start=enabled) return str(True) except Exception as e: - util.SMlog('linstor-manager:disable error: {}'.format(e)) + util.SMlog( + 'linstor-manager:update_minidrbdcluster error: {}'.format(e) + ) return str(False) @@ -81,12 +126,12 @@ def attach(session, args): vdi_uuid = args['vdiUuid'] group_name = args['groupName'] - linstor_uri = get_linstor_uri(session) + controller_uri = get_controller_uri() journaler = LinstorJournaler( - linstor_uri, group_name, logger=util.SMlog + controller_uri, group_name, logger=util.SMlog ) linstor = LinstorVolumeManager( - linstor_uri, + controller_uri, group_name, logger=util.SMlog ) @@ -104,7 +149,7 @@ def detach(session, args): group_name = args['groupName'] linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -115,6 +160,29 @@ def detach(session, args): return str(False) +def destroy(session, args): + try: + group_name = args['groupName'] + + # When destroy is called, there are no running minidrbdcluster daemons. + # So the controllers are stopped too, we must start an instance. 
+ restart_service('var-lib-linstor.mount') + restart_service('linstor-controller') + + linstor = LinstorVolumeManager( + 'linstor://localhost', + group_name, + logger=util.SMlog + ) + linstor.destroy() + return str(True) + except Exception as e: + stop_service('linstor-controller') + stop_service('var-lib-linstor.mount') + util.SMlog('linstor-manager:destroy error: {}'.format(e)) + return str(False) + + def check(session, args): try: device_path = args['devicePath'] @@ -133,7 +201,7 @@ def get_vhd_info(session, args): include_parent = distutils.util.strtobool(args['includeParent']) linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -168,7 +236,7 @@ def get_parent(session, args): group_name = args['groupName'] linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -244,7 +312,7 @@ def lock_vdi(session, args): lock.acquire() linstor = LinstorVolumeManager( - get_linstor_uri(session), + get_controller_uri(), group_name, logger=util.SMlog ) @@ -261,9 +329,12 @@ def lock_vdi(session, args): if __name__ == '__main__': XenAPIPlugin.dispatch({ - 'enable': enable, + 'prepareSr': prepare_sr, + 'releaseSr': release_sr, + 'updateMinidrbdcluster': update_minidrbdcluster, 'attach': attach, 'detach': detach, + 'destroy': destroy, 'check': check, 'getVHDInfo': get_vhd_info, 'hasParent': has_parent, diff --git a/drivers/linstorjournaler.py b/drivers/linstorjournaler.py index 74953305..285012ca 100755 --- a/drivers/linstorjournaler.py +++ b/drivers/linstorjournaler.py @@ -16,7 +16,7 @@ # -from linstorvolumemanager import LinstorVolumeManager +from linstorvolumemanager import get_controller_uri, LinstorVolumeManager import linstor import re import util @@ -52,20 +52,10 @@ def __init__(self, uri, group_name, logger=default_logger.__func__): self._namespace = '{}journal/'.format( LinstorVolumeManager._build_sr_namespace() ) - - def connect(): - 
self._journal = linstor.KV( - LinstorVolumeManager._build_group_name(group_name), - uri=uri, - namespace=self._namespace - ) - - util.retry( - connect, - maxretry=60, - exceptions=[linstor.errors.LinstorNetworkError] - ) self._logger = logger + self._journal = self._create_journal_instance( + uri, group_name, self._namespace + ) def create(self, type, identifier, value): # TODO: Maybe rename to 'add' in the future (in Citrix code too). @@ -150,6 +140,28 @@ def hasJournals(self, identifier): def _reset_namespace(self): self._journal.namespace = self._namespace + @classmethod + def _create_journal_instance(cls, uri, group_name, namespace): + def connect(uri): + if not uri: + uri = get_controller_uri() + return linstor.KV( + LinstorVolumeManager._build_group_name(group_name), + uri=uri, + namespace=namespace + ) + + try: + return connect(uri) + except linstor.errors.LinstorNetworkError: + pass + + return util.retry( + lambda: connect(None), + maxretry=10, + exceptions=[linstor.errors.LinstorNetworkError] + ) + @staticmethod def _get_key(type, identifier): return '{}/{}'.format(type, identifier) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index a6f67d8d..a383e327 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -16,15 +16,30 @@ # +import glob import json import linstor import os.path import re +import shutil import socket import time import util +import uuid +# Contains the data of the "/var/lib/linstor" directory. +DATABASE_VOLUME_NAME = 'xcp-persistent-database' +DATABASE_SIZE = 1 << 30 # 1GB. 
+DATABASE_PATH = '/var/lib/linstor' +DATABASE_MKFS = 'mkfs.ext4' + +REG_DRBDADM_PRIMARY = re.compile("([^\\s]+)\\s+role:Primary") +REG_DRBDSETUP_IP = re.compile('[^\\s]+\\s+(.*):.*$') + + +# ============================================================================== + def round_up(value, divisor): assert divisor divisor = int(divisor) @@ -37,6 +52,79 @@ def round_down(value, divisor): return value - (value % int(divisor)) +# ============================================================================== + +def get_remote_host_ip(node_name): + (ret, stdout, stderr) = util.doexec([ + 'drbdsetup', 'show', DATABASE_VOLUME_NAME, '--json' + ]) + if ret != 0: + return + + try: + conf = json.loads(stdout) + if not conf: + return + + for connection in conf[0]['connections']: + if connection['net']['_name'] == node_name: + value = connection['path']['_remote_host'] + res = REG_DRBDSETUP_IP.match(value) + if res: + return res.groups()[0] + break + except Exception: + pass + + +def _get_controller_uri(): + (ret, stdout, stderr) = util.doexec([ + 'drbdadm', 'status', DATABASE_VOLUME_NAME + ]) + if ret != 0: + return + + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + return 'linstor://localhost' + + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + node_name = res.groups()[0] + ip = get_remote_host_ip(node_name) + if ip: + return 'linstor://' + ip + + +def get_controller_uri(): + retries = 0 + while True: + uri = _get_controller_uri() + if uri: + return uri + + retries += 1 + if retries >= 10: + break + time.sleep(1) + + +def get_controller_node_name(): + (ret, stdout, stderr) = util.doexec([ + 'drbdadm', 'status', DATABASE_VOLUME_NAME + ]) + if ret != 0: + return + + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + return 'localhost' + + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + return res.groups()[0] + + +# ============================================================================== + class 
LinstorVolumeManagerError(Exception): ERR_GENERIC = 0, ERR_VOLUME_EXISTS = 1, @@ -50,6 +138,7 @@ def __init__(self, message, code=ERR_GENERIC): def code(self): return self._code + # ============================================================================== # Note: @@ -152,7 +241,7 @@ def __init__( self, uri, group_name, repair=False, logger=default_logger.__func__ ): """ - Create a new LinstorApi object. + Create a new LinstorVolumeManager object. :param str uri: URI to communicate with the LINSTOR controller. :param str group_name: The SR goup name to use. :param bool repair: If true we try to remove bad volumes due to a crash @@ -160,7 +249,6 @@ def __init__( :param function logger: Function to log messages. """ - self._uri = uri self._linstor = self._create_linstor_instance(uri) self._base_group_name = group_name @@ -266,7 +354,7 @@ def min_physical_size(self): ) if size is None or current_size < size: size = current_size - return size * 1024 + return (size or 0) * 1024 @property def total_volume_size(self): @@ -379,19 +467,24 @@ def check_volume_exists(self, volume_uuid): """ return volume_uuid in self._volumes - def create_volume(self, volume_uuid, size, persistent=True): + def create_volume( + self, volume_uuid, size, persistent=True, volume_name=None + ): """ Create a new volume on the SR. :param str volume_uuid: The volume uuid to use. :param int size: volume size in B. :param bool persistent: If false the volume will be unavailable on the next constructor call LinstorSR(...). + :param str volume_name: If set, this name is used in the LINSTOR + database instead of a generated name. :return: The current device path of the volume. 
:rtype: str """ self._logger('Creating LINSTOR volume {}...'.format(volume_uuid)) - volume_name = self.build_volume_name(util.gen_uuid()) + if not volume_name: + volume_name = self.build_volume_name(util.gen_uuid()) volume_properties = self._create_volume_with_properties( volume_uuid, volume_name, size, place_resources=True ) @@ -1073,23 +1166,56 @@ def remove_resourceless_volumes(self): if not volume_name or volume_name not in resource_names: self.destroy_volume(volume_uuid) - def destroy(self, force=False): + def destroy(self): """ Destroy this SR. Object should not be used after that. :param bool force: Try to destroy volumes before if true. """ - if (force): - for volume_uuid in self._volumes: - self.destroy_volume(volume_uuid) + if self._volumes: + raise LinstorVolumeManagerError( + 'Cannot destroy LINSTOR volume manager: ' + 'It exists remaining volumes' + ) - # TODO: Throw exceptions in the helpers below if necessary. - # TODO: What's the required action if it exists remaining volumes? + uri = 'linstor://localhost' + try: + self._start_controller(start=False) - self._destroy_resource_group(self._linstor, self._group_name) - for pool in self._get_storage_pools(force=True): - self._destroy_storage_pool( - self._linstor, pool.name, pool.node_name + # 1. Umount LINSTOR database. + self._mount_database_volume( + self.build_device_path(DATABASE_VOLUME_NAME), + mount=False, + force=True + ) + + # 2. Refresh instance. + self._start_controller(start=True) + self._linstor = self._create_linstor_instance( + uri, keep_uri_unmodified=True + ) + + # 3. Destroy database volume. + self._destroy_resource(DATABASE_VOLUME_NAME) + + # 4. Destroy group and storage pools. 
+ self._destroy_resource_group(self._linstor, self._group_name) + for pool in self._get_storage_pools(force=True): + self._destroy_storage_pool( + self._linstor, pool.name, pool.node_name + ) + except Exception as e: + self._start_controller(start=True) + raise e + + try: + self._start_controller(start=False) + for file in glob.glob(DATABASE_PATH + '/'): + os.remove(file) + except Exception as e: + util.SMlog( + 'Ignoring failure after LINSTOR SR destruction: {}' + .format(e) ) def find_up_to_date_diskfull_nodes(self, volume_uuid): @@ -1130,29 +1256,75 @@ def invalidate_resource_cache(self): @classmethod def create_sr( - cls, uri, group_name, node_names, redundancy, + cls, group_name, node_names, ips, redundancy, thin_provisioning=False, logger=default_logger.__func__ ): """ Create a new SR on the given nodes. - :param str uri: URI to communicate with the LINSTOR controller. :param str group_name: The SR group_name to use. :param list[str] node_names: String list of nodes. :param int redundancy: How many copy of volumes should we store? + :param set(str) ips: Node ips :param function logger: Function to log messages. :return: A new LinstorSr instance. :rtype: LinstorSr """ + try: + cls._start_controller(start=True) + sr = cls._create_sr( + group_name, + node_names, + ips, + redundancy, + thin_provisioning, + logger + ) + finally: + # Controller must be stopped and volume unmounted because + # it is the role of the minidrbdcluster daemon to do the right + # actions. + cls._start_controller(start=False) + cls._mount_volume( + cls.build_device_path(DATABASE_VOLUME_NAME), + DATABASE_PATH, + mount=False + ) + return sr + + @classmethod + def _create_sr( + cls, group_name, node_names, ips, redundancy, + thin_provisioning=False, + logger=default_logger.__func__ + ): # 1. Check if SR already exists. 
- lin = cls._create_linstor_instance(uri) + uri = 'linstor://localhost' + + lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) + + for node_name in node_names: + ip = ips[node_name] + result = lin.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip + ) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [linstor.consts.FAIL_EXISTS_NODE]): + continue + + if errors: + raise LinstorVolumeManagerError( + 'Failed to create node `{}` with ip `{}`: {}'.format( + node_name, ip, cls._get_error_str(errors) + ) + ) + driver_pool_name = group_name group_name = cls._build_group_name(group_name) pools = lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) - - # TODO: Maybe if the SR already exists and if the nodes are the same, - # we can try to use it directly. pools = pools.storage_pools if pools: existing_node_names = map(lambda pool: pool.node_name, pools) @@ -1227,25 +1399,64 @@ def create_sr( ) ) - # 3. Remove storage pools/resource/volume group in the case of errors. + # 3. Create the LINSTOR database volume and mount it. + try: + logger('Creating database volume...') + volume_path = cls._create_database_volume(lin, group_name) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + logger('Destroying database volume after creation fail...') + cls._force_destroy_database_volume(lin, group_name) + raise + + try: + logger('Mounting database volume...') + + # First we must disable the controller to move safely the + # LINSTOR config. + cls._start_controller(start=False) + + cls._mount_database_volume(volume_path) + except Exception as e: + # Ensure we are connected because controller has been + # restarted during mount call. 
+ logger('Destroying database volume after mount fail...') + + try: + cls._start_controller(start=True) + except Exception: + pass + + lin = cls._create_linstor_instance( + uri, keep_uri_unmodified=True + ) + cls._force_destroy_database_volume(lin, group_name) + raise e + + cls._start_controller(start=True) + lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) + + # 4. Remove storage pools/resource/volume group in the case of errors. except Exception as e: + logger('Destroying resource group and storage pools after fail...') try: cls._destroy_resource_group(lin, group_name) - except Exception: + except Exception as e2: + logger('Failed to destroy resource group: {}'.format(e2)) pass j = 0 i = min(i, len(node_names) - 1) while j <= i: try: cls._destroy_storage_pool(lin, group_name, node_names[j]) - except Exception: + except Exception as e2: + logger('Failed to destroy resource group: {}'.format(e2)) pass j += 1 raise e - # 4. Return new instance. + # 5. Return new instance. 
instance = cls.__new__(cls) - instance._uri = uri instance._linstor = lin instance._logger = logger instance._redundancy = redundancy @@ -1462,26 +1673,6 @@ def _get_storage_pools(self, force=False): return self._storage_pools - def _check_volume_creation_errors(self, result, volume_uuid): - errors = self._filter_errors(result) - if self._check_errors(errors, [ - linstor.consts.FAIL_EXISTS_RSC, linstor.consts.FAIL_EXISTS_RSC_DFN - ]): - raise LinstorVolumeManagerError( - 'Failed to create volume `{}` from SR `{}`, it already exists' - .format(volume_uuid, self._group_name), - LinstorVolumeManagerError.ERR_VOLUME_EXISTS - ) - - if errors: - raise LinstorVolumeManagerError( - 'Failed to create volume `{}` from SR `{}`: {}'.format( - volume_uuid, - self._group_name, - self._get_error_str(errors) - ) - ) - def _create_volume(self, volume_uuid, volume_name, size, place_resources): size = self.round_up_volume_size(size) @@ -1491,7 +1682,7 @@ def _create_volume(self, volume_uuid, volume_name, size, place_resources): rsc_dfn_name=volume_name, vlm_sizes=['{}B'.format(size)], definitions_only=not place_resources - ), volume_uuid) + ), volume_uuid, self._group_name) def _create_volume_with_properties( self, volume_uuid, volume_name, size, place_resources @@ -1535,12 +1726,8 @@ def _create_volume_with_properties( # before the `self._create_volume` case. # It can only happen if the same volume uuid is used in the same # call in another host. 
- if e.code == LinstorVolumeManagerError.ERR_VOLUME_EXISTS: - raise - self._force_destroy_volume(volume_uuid) - raise - except Exception: - self._force_destroy_volume(volume_uuid) + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + self._force_destroy_volume(volume_uuid) raise def _find_device_path(self, volume_uuid, volume_name): @@ -1576,7 +1763,10 @@ def _request_device_path(self, volume_uuid, volume_name, activate=False): if not resources: if activate: - self._activate_device_path(node_name, volume_name) + self._mark_resource_cache_as_dirty() + self._activate_device_path( + self._linstor, node_name, volume_name + ) return self._request_device_path(volume_uuid, volume_name) raise LinstorVolumeManagerError( 'Empty dev path for `{}`, but definition "seems" to exist' @@ -1585,25 +1775,6 @@ def _request_device_path(self, volume_uuid, volume_name, activate=False): # Contains a path of the /dev/drbd form. return resources[0].volumes[0].device_path - def _activate_device_path(self, node_name, volume_name): - self._mark_resource_cache_as_dirty() - result = self._linstor.resource_create([ - linstor.ResourceData(node_name, volume_name, diskless=True) - ]) - if linstor.Linstor.all_api_responses_no_error(result): - return - errors = linstor.Linstor.filter_api_call_response_errors(result) - if len(errors) == 1 and errors[0].is_error( - linstor.consts.FAIL_EXISTS_RSC - ): - return - - raise LinstorVolumeManagerError( - 'Unable to activate device path of `{}` on node `{}`: {}' - .format(volume_name, node_name, ', '.join( - [str(x) for x in result])) - ) - def _destroy_resource(self, resource_name): self._mark_resource_cache_as_dirty() result = self._linstor.resource_dfn_delete(resource_name) @@ -1757,7 +1928,7 @@ def _get_volumes_by_property( def _create_linstor_kv(self, namespace): return linstor.KV( self._get_store_name(), - uri=self._uri, + uri=self._linstor.controller_host(), namespace=namespace ) @@ -1787,46 +1958,347 @@ def _get_error_str(cls, result): ]) 
@classmethod - def _create_linstor_instance(cls, uri): - def connect(): + def _create_linstor_instance(cls, uri, keep_uri_unmodified=False): + retry = False + + def connect(uri): + if not uri: + uri = get_controller_uri() + if not uri: + raise LinstorVolumeManagerError( + 'Unable to find controller uri...' + ) instance = linstor.Linstor(uri, keep_alive=True) instance.connect() return instance + try: + return connect(uri) + except (linstor.errors.LinstorNetworkError, LinstorVolumeManagerError): + pass + + if not keep_uri_unmodified: + uri = None + return util.retry( - connect, - maxretry=60, - exceptions=[linstor.errors.LinstorNetworkError] + lambda: connect(uri), + maxretry=10, + exceptions=[ + linstor.errors.LinstorNetworkError, + LinstorVolumeManagerError + ] ) @classmethod - def _destroy_storage_pool(cls, lin, group_name, node_name): - result = lin.storage_pool_delete(node_name, group_name) + def _activate_device_path(cls, lin, node_name, volume_name): + result = lin.resource_create([ + linstor.ResourceData(node_name, volume_name, diskless=True) + ]) + if linstor.Linstor.all_api_responses_no_error(result): + return + errors = linstor.Linstor.filter_api_call_response_errors(result) + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_EXISTS_RSC + ): + return + + raise LinstorVolumeManagerError( + 'Unable to activate device path of `{}` on node `{}`: {}' + .format(volume_name, node_name, ', '.join( + [str(x) for x in result])) + ) + + @classmethod + def _request_database_path(cls, lin, activate=False): + node_name = socket.gethostname() + + try: + resources = filter( + lambda resource: resource.node_name == node_name and + resource.name == DATABASE_VOLUME_NAME, + lin.resource_list_raise().resources + ) + except Exception as e: + raise LinstorVolumeManagerError( + 'Unable to get resources during database creation: {}' + .format(e) + ) + + if not resources: + if activate: + cls._activate_device_path( + lin, node_name, DATABASE_VOLUME_NAME + ) + return 
cls._request_database_path( + DATABASE_VOLUME_NAME, DATABASE_VOLUME_NAME + ) + raise LinstorVolumeManagerError( + 'Empty dev path for `{}`, but definition "seems" to exist' + .format(DATABASE_PATH) + ) + # Contains a path of the /dev/drbd form. + return resources[0].volumes[0].device_path + + @classmethod + def _create_database_volume(cls, lin, group_name): + try: + dfns = lin.resource_dfn_list_raise().resource_definitions + except Exception as e: + raise LinstorVolumeManagerError( + 'Unable to get definitions during database creation: {}' + .format(e) + ) + + if dfns: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`, '.format( + DATABASE_VOLUME_NAME, group_name + ) + 'LINSTOR volume list must be empty.' + ) + + size = cls.round_up_volume_size(DATABASE_SIZE) + cls._check_volume_creation_errors(lin.resource_group_spawn( + rsc_grp_name=group_name, + rsc_dfn_name=DATABASE_VOLUME_NAME, + vlm_sizes=['{}B'.format(size)], + definitions_only=False + ), DATABASE_VOLUME_NAME, group_name) + + # We must modify the quorum. Otherwise we can't use correctly the + # minidrbdcluster daemon. + result = lin.resource_dfn_modify(DATABASE_VOLUME_NAME, { + 'DrbdOptions/auto-quorum': 'disabled', + 'DrbdOptions/Resource/quorum': 'majority' + }) error_str = cls._get_error_str(result) if error_str: raise LinstorVolumeManagerError( - 'Failed to destroy SP `{}` on node `{}`: {}'.format( - group_name, - node_name, - error_str + 'Could not activate quorum on database volume: {}' + .format(error_str) + ) + + current_device_path = cls._request_database_path(lin, activate=True) + + # We use realpath here to get the /dev/drbd path instead of + # /dev/drbd/by-res/. 
+ expected_device_path = cls.build_device_path(DATABASE_VOLUME_NAME) + util.wait_for_path(expected_device_path, 5) + + device_realpath = os.path.realpath(expected_device_path) + if current_device_path != device_realpath: + raise LinstorVolumeManagerError( + 'Invalid path, current={}, expected={} (realpath={})' + .format( + current_device_path, + expected_device_path, + device_realpath ) ) + try: + util.pread2([DATABASE_MKFS, expected_device_path]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to execute {} on database volume: {}' + .format(DATABASE_MKFS, e) + ) + + return expected_device_path + @classmethod - def _destroy_resource_group(cls, lin, group_name): - result = lin.resource_group_delete(group_name) - error_str = cls._get_error_str(result) + def _destroy_database_volume(cls, lin, group_name): + error_str = cls._get_error_str( + lin.resource_dfn_delete(DATABASE_VOLUME_NAME) + ) if error_str: raise LinstorVolumeManagerError( - 'Failed to destroy RG `{}`: {}'.format(group_name, error_str) + 'Could not destroy resource `{}` from SR `{}`: {}' + .format(DATABASE_VOLUME_NAME, group_name, error_str) ) + @classmethod + def _mount_database_volume(cls, volume_path, mount=True, force=False): + backup_path = DATABASE_PATH + '-' + str(uuid.uuid4()) + + try: + # 1. Create a backup config folder. + database_not_empty = bool(os.listdir(DATABASE_PATH)) + if database_not_empty: + try: + os.mkdir(backup_path) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to create backup path {} of LINSTOR config: {}' + .format(backup_path, e) + ) + + # 2. Move the config in the mounted volume. + if database_not_empty: + cls._move_files(DATABASE_PATH, backup_path) + + cls._mount_volume(volume_path, DATABASE_PATH, mount) + + if database_not_empty: + cls._move_files(backup_path, DATABASE_PATH, force) + + # 3. Remove useless backup directory. 
+ try: + os.rmdir(backup_path) + except Exception: + raise LinstorVolumeManagerError( + 'Failed to remove backup path {} of LINSTOR config {}' + .format(backup_path, e) + ) + except Exception as e: + def force_exec(fn): + try: + fn() + except Exception: + pass + + if mount == cls._is_mounted(DATABASE_PATH): + force_exec(lambda: cls._move_files( + DATABASE_PATH, backup_path + )) + force_exec(lambda: cls._mount_volume( + volume_path, DATABASE_PATH, not mount + )) + + if mount != cls._is_mounted(DATABASE_PATH): + force_exec(lambda: cls._move_files( + backup_path, DATABASE_PATH + )) + + force_exec(lambda: os.rmdir(backup_path)) + raise e + + @classmethod + def _force_destroy_database_volume(cls, lin, group_name): + try: + cls._destroy_database_volume(lin, group_name) + except Exception: + pass + + @classmethod + def _destroy_storage_pool(cls, lin, group_name, node_name): + def destroy(): + result = lin.storage_pool_delete(node_name, group_name) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_NOT_FOUND_STOR_POOL, + linstor.consts.FAIL_NOT_FOUND_STOR_POOL_DFN + ]): + return + + if errors: + raise LinstorVolumeManagerError( + 'Failed to destroy SP `{}` on node `{}`: {}'.format( + group_name, + node_name, + cls._get_error_str(errors) + ) + ) + + # We must retry to avoid errors like: + # "can not be deleted as volumes / snapshot-volumes are still using it" + # after LINSTOR database volume destruction. 
+ return util.retry(destroy, maxretry=10) + + @classmethod + def _destroy_resource_group(cls, lin, group_name): + def destroy(): + result = lin.resource_group_delete(group_name) + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_NOT_FOUND_RSC_GRP + ]): + return + + if errors: + raise LinstorVolumeManagerError( + 'Failed to destroy RG `{}`: {}' + .format(group_name, cls._get_error_str(errors)) + ) + + return util.retry(destroy, maxretry=10) + @classmethod def _build_group_name(cls, base_name): # If thin provisioning is used we have a path like this: # `VG/LV`. "/" is not accepted by LINSTOR. return '{}{}'.format(cls.PREFIX_SR, base_name.replace('/', '_')) + @classmethod + def _check_volume_creation_errors(cls, result, volume_uuid, group_name): + errors = cls._filter_errors(result) + if cls._check_errors(errors, [ + linstor.consts.FAIL_EXISTS_RSC, linstor.consts.FAIL_EXISTS_RSC_DFN + ]): + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`, it already exists' + .format(volume_uuid, group_name), + LinstorVolumeManagerError.ERR_VOLUME_EXISTS + ) + + if errors: + raise LinstorVolumeManagerError( + 'Failed to create volume `{}` from SR `{}`: {}'.format( + volume_uuid, + group_name, + cls._get_error_str(errors) + ) + ) + + @classmethod + def _move_files(cls, src_dir, dest_dir, force=False): + def listdir(dir): + ignored = ['lost+found'] + return filter(lambda file: file not in ignored, os.listdir(dir)) + + try: + if not force: + files = listdir(dest_dir) + if files: + raise LinstorVolumeManagerError( + 'Cannot move files from {} to {} because destination ' + 'contains: {}'.format(src_dir, dest_dir, files) + ) + except LinstorVolumeManagerError: + raise + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot list dir {}: {}'.format(dest_dir, e) + ) + + try: + for file in listdir(src_dir): + try: + dest_file = os.path.join(dest_dir, file) + if not force and os.path.exists(dest_file): + raise 
LinstorVolumeManagerError( + 'Cannot move {} because it already exists in the ' + 'destination'.format(file) + ) + shutil.move(os.path.join(src_dir, file), dest_file) + except LinstorVolumeManagerError: + raise + except Exception as e: + raise LinstorVolumeManagerError( + 'Cannot move {}: {}'.format(file, e) + ) + except Exception as e: + if not force: + try: + cls._move_files(dest_dir, src_dir, force=True) + except Exception: + pass + + raise LinstorVolumeManagerError( + 'Failed to move files from {} to {}: {}'.format( + src_dir, dest_dir, e + ) + ) + @staticmethod def _get_filtered_properties(properties): return dict(properties.items()) @@ -1845,3 +2317,44 @@ def _check_errors(result, codes): if err.is_error(code): return True return False + + @classmethod + def _start_controller(cls, start=True): + return cls._start_service('linstor-controller', start) + + @staticmethod + def _start_service(name, start=True): + action = 'start' if start else 'stop' + (ret, out, err) = util.doexec([ + 'systemctl', action, name + ]) + if ret != 0: + raise LinstorVolumeManagerError( + 'Failed to {} {}: {} {}' + .format(action, name, out, err) + ) + + @staticmethod + def _is_mounted(mountpoint): + (ret, out, err) = util.doexec(['mountpoint', '-q', mountpoint]) + return ret == 0 + + @classmethod + def _mount_volume(cls, volume_path, mountpoint, mount=True): + if mount: + try: + util.pread(['mount', volume_path, mountpoint]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to mount volume {} on {}: {}' + .format(volume_path, mountpoint, e) + ) + else: + try: + if cls._is_mounted(mountpoint): + util.pread(['umount', mountpoint]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to umount volume {} on {}: {}' + .format(volume_path, mountpoint, e) + ) diff --git a/drivers/tapdisk-pause b/drivers/tapdisk-pause index ed6abede..e0bca7be 100755 --- a/drivers/tapdisk-pause +++ b/drivers/tapdisk-pause @@ -30,7 +30,7 @@ import vhdutil import lvmcache 
try: - from linstorvolumemanager import LinstorVolumeManager + from linstorvolumemanager import get_controller_uri, LinstorVolumeManager LINSTOR_AVAILABLE = True except ImportError: LINSTOR_AVAILABLE = False @@ -152,10 +152,6 @@ class Tapdisk: # "B" path. Note: "A", "B" and "OLD_A" are UUIDs. session = self.session - linstor_uri = 'linstor://{}'.format( - util.get_master_rec(session)['address'] - ) - host_ref = util.get_this_host_ref(session) sr_ref = session.xenapi.SR.get_by_uuid(self.sr_uuid) @@ -167,7 +163,7 @@ class Tapdisk: group_name = dconf['group-name'] device_path = LinstorVolumeManager( - linstor_uri, + get_controller_uri(), group_name, logger=util.SMlog ).get_device_path(self.vdi_uuid) diff --git a/drivers/util.py b/drivers/util.py index f4f62525..2edd888e 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -657,31 +657,10 @@ def get_master_ref(session): return session.xenapi.pool.get_master(pools[0]) -def get_master_rec(session): - return session.xenapi.host.get_record(get_master_ref(session)) - - def is_master(session): return get_this_host_ref(session) == get_master_ref(session) -def get_master_address(): - address = None - try: - fd = open('/etc/xensource/pool.conf', 'r') - try: - items = fd.readline().split(':') - if items[0].strip() == 'master': - address = 'localhost' - else: - address = items[1].strip() - finally: - fd.close() - except Exception: - pass - return address - - # XXX: this function doesn't do what it claims to do def get_localhost_uuid(session): filename = '/etc/xensource-inventory' diff --git a/etc/minidrbdcluster.ini b/etc/minidrbdcluster.ini new file mode 100644 index 00000000..0126e862 --- /dev/null +++ b/etc/minidrbdcluster.ini @@ -0,0 +1,14 @@ +# minidrbdcluster keeps a service running on one of the nodes. +# Quorum must be enabled in the DRBD resource! +# +# The section names are the names of DRBD resources. Within a +# section name the systemd-units to activate on one of the nodes. 
+ +[xcp-persistent-database] +systemd-units=var-lib-linstor.mount,linstor-controller.service + +[xcp-persistent-ha-statefile] +systemd-units= + +[xcp-persistent-redo-log] +systemd-units= diff --git a/etc/systemd/system/linstor-satellite.service.d/override.conf b/etc/systemd/system/linstor-satellite.service.d/override.conf new file mode 100644 index 00000000..b1686b4f --- /dev/null +++ b/etc/systemd/system/linstor-satellite.service.d/override.conf @@ -0,0 +1,5 @@ +[Service] +Environment=LS_KEEP_RES=^xcp-persistent* + +[Unit] +After=drbd.service diff --git a/etc/systemd/system/var-lib-linstor.mount b/etc/systemd/system/var-lib-linstor.mount new file mode 100644 index 00000000..a05a7f74 --- /dev/null +++ b/etc/systemd/system/var-lib-linstor.mount @@ -0,0 +1,6 @@ +[Unit] +Description=Filesystem for the LINSTOR controller + +[Mount] +What=/dev/drbd/by-res/xcp-persistent-database/0 +Where=/var/lib/linstor diff --git a/linstor/linstor-monitord.c b/linstor/linstor-monitord.c index a1592fda..47740598 100644 --- a/linstor/linstor-monitord.c +++ b/linstor/linstor-monitord.c @@ -287,18 +287,6 @@ static inline int addInotifyWatch (int inotifyFd, const char *filepath, uint32_t // ----------------------------------------------------------------------------- -static inline int updateLinstorController (int isMaster) { - syslog(LOG_INFO, "%s linstor-controller...", isMaster ? "Enabling" : "Disabling"); - char *argv[] = { - "systemctl", - isMaster ? "enable" : "disable", - "--now", - "linstor-controller", - NULL - }; - return execCommand(argv, NULL); -} - static inline int updateLinstorNode (State *state) { char buffer[256]; if (gethostname(buffer, sizeof buffer) == -1) { @@ -416,7 +404,6 @@ static inline int processPoolConfEvents (State *state, int wd, char **buffer, si inotify_rm_watch(state->inotifyFd, wd); // Do not forget to remove watch to avoid leaks. 
return -EIO; } - ret = updateLinstorController(state->isMaster); } else { if (mask & (IN_CREATE | IN_MOVED_TO)) { syslog(LOG_ERR, "Watched `" POOL_CONF_ABS_FILE "` file has been recreated!"); @@ -495,8 +482,6 @@ static inline int waitForPoolConfCreation (State *state, int *wdFile) { // Update LINSTOR services... int ret; state->isMaster = isMasterHost(&ret); - if (!ret) - ret = updateLinstorController(state->isMaster); // Ok we can't read the pool configuration file. // Maybe the file doesn't exist. Waiting its creation... diff --git a/scripts/minidrbdcluster b/scripts/minidrbdcluster new file mode 100755 index 00000000..a04b6c1c --- /dev/null +++ b/scripts/minidrbdcluster @@ -0,0 +1,171 @@ +#! /usr/bin/env python2 + +import configparser +import os +import re +import signal +import subprocess + +DRBDADM_OPEN_FAILED_RE = re.compile( + 'open\\((.*)\\) failed: No such file or directory' +) +MAY_PROMOT_RE = re.compile( + '(?:exists|change) resource name:((?:\\w|-)+) ' + '(?:\\w+\\:\\w+ )*may_promote:(yes|no) promotion_score:(\\d+)' +) +PEER_ROLE_RE = re.compile( + '(?:exists|change) connection name:((?:\\w|-)+) peer-node-id:(?:\\d+) ' + 'conn-name:(\\w+) (?:\\w+\\:\\w+ )*role:(Primary|Secondary|Unknown)' +) +HAVE_QUORUM_RE = re.compile( + '(?:exists|change) device name:((?:\\w|-)+) ' + '(?:\\w+\\:\\w+ )*quorum:(yes|no)' +) + + +class SigHupException(Exception): + pass + + +def sig_handler(sig, frame): + raise SigHupException( + 'Received signal ' + str(sig) + + ' on line ' + str(frame.f_lineno) + + ' in ' + frame.f_code.co_filename + ) + + +def call_systemd(operation, service): + verbose = operation in ('start', 'stop') + if verbose: + print('Trying to %s %s' % (operation, service)) + r = os.system('systemctl %s %s' % (operation, service)) + if verbose: + print('%s for %s %s' % ( + 'success' if r == 0 else 'failure', operation, service + )) + return r == 0 + + +def ensure_systemd_started(service): + args = ['systemctl', 'is-active', '--quiet', service] + + proc = 
subprocess.Popen(args) + proc.wait() + if not proc.returncode: + return True # Already active. + + return call_systemd('start', service) + + +def show_status(services, status): + print('status:') + for systemd_unit in services: + call_systemd('status', systemd_unit) + for res_name in status: + print('%s is %s' % (res_name, status[res_name])) + + +def clean_up(services): + print('exiting:') + for systemd_unit in reversed(services): + call_systemd('stop', systemd_unit) + + +def get_systemd_units(systemd_units_str): + systemd_units = [] + for systemd_unit in systemd_units_str.split(','): + systemd_unit = systemd_unit.strip() + if systemd_unit: + systemd_units.append(systemd_unit) + return systemd_units + + +def process(events2, resources, services, status): + line = events2.stdout.readline() + m = MAY_PROMOT_RE.match(line) + if m: + res_name, may_promote, promotion_score = m.groups() + if res_name in resources and may_promote == 'yes': + systemd_units_str = resources[res_name]['systemd-units'] + for systemd_unit in get_systemd_units(systemd_units_str): + if not ensure_systemd_started(systemd_unit): + break + if systemd_unit not in services: + services.append(systemd_unit) + m = PEER_ROLE_RE.match(line) + if m: + res_name, conn_name, role = m.groups() + if res_name in status: + status[res_name][conn_name] = role + m = HAVE_QUORUM_RE.match(line) + if m: + res_name, have_quorum = m.groups() + if res_name in resources and have_quorum == 'no': + systemd_units_str = resources[res_name]['systemd-units'] + systemd_units = get_systemd_units(systemd_units_str) + to_stop = [x for x in systemd_units if x in services] + if to_stop: + print('Lost quorum on %s' % (res_name)) + for systemd_unit in reversed(to_stop): + r = call_systemd('stop', systemd_unit) + if r: + services.remove(systemd_unit) + + +def active_drbd_volume(res_name): + retry = True + args = ['drbdadm', 'adjust', res_name] + while True: + proc = subprocess.Popen(args, stderr=subprocess.PIPE) + (stdout, stderr) = 
proc.communicate() + if not proc.returncode: + return # Success. \o/ + + if not retry: + break + + m = DRBDADM_OPEN_FAILED_RE.match(stderr) + if m and subprocess.call(['lvchange', '-ay', m.groups()[0]]) == 0: + retry = False + else: + break + + print('Failed to execute `{}`: {}'.format(args, stderr)) + + +def main(): + services = [] + status = dict() + config = configparser.ConfigParser() + config.read('/etc/minidrbdcluster.ini') + resources = config._sections + if not resources: + raise Exception( + 'No resources to watch, maybe /etc/minidrbdcluster.ini missing' + ) + print('Managing DRBD resources: %s' % (' '.join(resources))) + for res_name in resources: + status[res_name] = dict() + active_drbd_volume(res_name) + + signal.signal(signal.SIGHUP, sig_handler) + + print('Starting process...') + events2 = subprocess.Popen( + ['drbdsetup', 'events2'], stdout=subprocess.PIPE + ) + run = True + while run: + try: + process(events2, resources, services, status) + except KeyboardInterrupt: + run = False + except SigHupException: + show_status(services, status) + + clean_up(services) + + +if __name__ == '__main__': + main() diff --git a/systemd/minidrbdcluster.service b/systemd/minidrbdcluster.service new file mode 100644 index 00000000..3de6ac4f --- /dev/null +++ b/systemd/minidrbdcluster.service @@ -0,0 +1,18 @@ +[Unit] +Description=Minimalistic high-availability cluster resource manager +Before=xs-sm.service +Wants=network-online.target +After=network-online.target + +[Service] +Type=simple +Environment=PYTHONUNBUFFERED=1 +ExecStart=/opt/xensource/libexec/minidrbdcluster +KillMode=process +KillSignal=SIGINT +StandardOutput=journal +StandardError=journal +SyslogIdentifier=minidrbdcluster + +[Install] +WantedBy=multi-user.target From 0d512e79def0e4f5feff806983ea4f12c6c6b201 Mon Sep 17 00:00:00 2001 From: Wescoeur Date: Wed, 24 Feb 2021 11:17:23 +0100 Subject: [PATCH 029/133] feat(LinstorSR): ensure heartbeat and redo_log VDIs are not diskless Signed-off-by: Ronan Abhamon 
--- drivers/LinstorSR.py | 2 +- drivers/linstorvolumemanager.py | 97 +++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 9650d712..d943d49e 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1463,7 +1463,7 @@ def create(self, sr_uuid, vdi_uuid, size): self._linstor.create_volume( self.uuid, volume_size, persistent=False, - volume_name=volume_name + volume_name=volume_name, no_diskless=(volume_name is not None) ) volume_info = self._linstor.get_volume_info(self.uuid) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index a383e327..d8d64b4a 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -468,7 +468,8 @@ def check_volume_exists(self, volume_uuid): return volume_uuid in self._volumes def create_volume( - self, volume_uuid, size, persistent=True, volume_name=None + self, volume_uuid, size, persistent=True, volume_name=None, + no_diskless=False ): """ Create a new volume on the SR. @@ -478,6 +479,8 @@ def create_volume( on the next constructor call LinstorSR(...). :param str volume_name: If set, this name is used in the LINSTOR database instead of a generated name. + :param bool no_diskless: If set, the default group redundancy is not + used, instead the volume is created on all nodes. :return: The current device path of the volume. 
:rtype: str """ @@ -486,7 +489,8 @@ def create_volume( if not volume_name: volume_name = self.build_volume_name(util.gen_uuid()) volume_properties = self._create_volume_with_properties( - volume_uuid, volume_name, size, place_resources=True + volume_uuid, volume_name, size, place_resources=True, + no_diskless=no_diskless ) try: @@ -1673,19 +1677,88 @@ def _get_storage_pools(self, force=False): return self._storage_pools - def _create_volume(self, volume_uuid, volume_name, size, place_resources): + def _create_volume( + self, volume_uuid, volume_name, size, place_resources, + no_diskless=False + ): size = self.round_up_volume_size(size) - self._mark_resource_cache_as_dirty() - self._check_volume_creation_errors(self._linstor.resource_group_spawn( - rsc_grp_name=self._group_name, - rsc_dfn_name=volume_name, - vlm_sizes=['{}B'.format(size)], - definitions_only=not place_resources - ), volume_uuid, self._group_name) + + # A. Basic case when we use the default redundancy of the group. + if not no_diskless: + self._check_volume_creation_errors( + self._linstor.resource_group_spawn( + rsc_grp_name=self._group_name, + rsc_dfn_name=volume_name, + vlm_sizes=['{}B'.format(size)], + definitions_only=not place_resources + ), + volume_uuid, + self._group_name + ) + return + + # B. Complex case. + if not place_resources: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`: it\'s impossible ' + .format(volume_uuid, self._group_name) + + 'to force no diskless without placing resources' + ) + + # B.1. Create resource list. + resources = [] + for node_name in self._get_node_names(): + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=volume_name, + storage_pool=self._group_name + )) + + # B.2. Create volume! 
+ def clean(): + try: + self._destroy_volume(volume_uuid) + except Exception as e: + self._logger( + 'Unable to destroy volume {} after creation fail: {}' + .format(volume_uuid, e) + ) + + def create(): + try: + self._check_volume_creation_errors( + self._linstor.resource_group_spawn( + rsc_grp_name=self._group_name, + rsc_dfn_name=volume_name, + vlm_sizes=['{}B'.format(size)], + definitions_only=True + ), + volume_uuid, + self._group_name + ) + + result = self._linstor.resource_create(resources) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`: {}'.format( + volume_uuid, self._group_name, error_str + ) + ) + except LinstorVolumeManagerError as e: + if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: + clean() + raise + except Exception: + clean() + raise + + util.retry(create, maxretry=5) def _create_volume_with_properties( - self, volume_uuid, volume_name, size, place_resources + self, volume_uuid, volume_name, size, place_resources, + no_diskless=False ): if self.check_volume_exists(volume_uuid): raise LinstorVolumeManagerError( @@ -1714,7 +1787,7 @@ def _create_volume_with_properties( volume_properties[self.PROP_VOLUME_NAME] = volume_name self._create_volume( - volume_uuid, volume_name, size, place_resources + volume_uuid, volume_name, size, place_resources, no_diskless ) assert volume_properties.namespace == \ From 4621245da3c5805b40e943b9b344ae991e255cfe Mon Sep 17 00:00:00 2001 From: Wescoeur Date: Thu, 25 Feb 2021 17:52:57 +0100 Subject: [PATCH 030/133] feat(LinstorSR): protect sr commands to avoid forgetting LINSTOR volumes when master satellite is down Steps to reproduce: - Ensure the linstor satellite is not running on the master host, otherwise stop it - Then restart the controller on the right host where the LINSTOR database is mounted - Run sr_attach command => All volumes will be forgotten To avoid this, it's possible to restart the satellite on the master
before the sr_attach command. Also it's funny to see you can start and stop the satellite just before the sr_attach, and the volumes will not be removed. Explanations: In theory this bug is impossible because during the sr_attach execution, an exception is thrown (so sr_scan should not be executed) BUT there is a piece of code that is executed in SRCommand.py when sr_attach is called: ```python try: return sr.attach(sr_uuid) finally: if is_master: sr.after_master_attach(sr_uuid) ``` The exception is not immediately forwarded because a finally block must be executed before. And what is the implementation of after_master_attach? ```python def after_master_attach(self, uuid): """Perform actions required after attaching on the pool master Return: None """ self.scan(uuid) ``` Oh! Of course, a scan is always executed after an attach... What's the purpose of a scan if we can't execute correctly an attach command before? I don't know, but it's probably error-prone like this context. When scan is called, we suppose the SR is attached and we have all VDIs loaded but it's not the case because an exception has been thrown. To solve this problem we forbid the execution of the scan if the attach failed. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 47 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index d943d49e..092f5e8e 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -256,6 +256,11 @@ class LinstorSR(SR.SR): MANAGER_PLUGIN = 'linstor-manager' + INIT_STATUS_NOT_SET = 0 + INIT_STATUS_IN_PROGRESS = 1 + INIT_STATUS_OK = 2 + INIT_STATUS_FAIL = 3 + # -------------------------------------------------------------------------- # SR methods.
# -------------------------------------------------------------------------- @@ -325,19 +330,18 @@ def load(self, sr_uuid): self._vdi_shared_time = 0 - self._initialized = False + self._init_status = self.INIT_STATUS_NOT_SET self._vdis_loaded = False self._all_volume_info_cache = None self._all_volume_metadata_cache = None def _locked_load(method): - @functools.wraps(method) - def wrap(self, *args, **kwargs): - if self._initialized: - return method(self, *args, **kwargs) - self._initialized = True + def wrapped_method(self, *args, **kwargs): + self._init_status = self.INIT_STATUS_OK + return method(self, *args, **kwargs) + def load(self, *args, **kwargs): if not self._has_session: if self.srcmd.cmd == 'vdi_attach_from_config': # We must have a valid LINSTOR instance here without using @@ -352,7 +356,7 @@ def wrap(self, *args, **kwargs): self._group_name, logger=util.SMlog ) - return method(self, *args, **kwargs) + return wrapped_method(self, *args, **kwargs) if not self._is_master: if self.cmd in [ @@ -456,11 +460,29 @@ def wrap(self, *args, **kwargs): ) util.SMlog(traceback.format_exc()) - return method(self, *args, **kwargs) + return wrapped_method(self, *args, **kwargs) + + @functools.wraps(wrapped_method) + def wrap(self, *args, **kwargs): + if self._init_status in \ + (self.INIT_STATUS_OK, self.INIT_STATUS_IN_PROGRESS): + return wrapped_method(self, *args, **kwargs) + if self._init_status == self.INIT_STATUS_FAIL: + util.SMlog( + 'Can\'t call method {} because initialization failed' + .format(method) + ) + else: + try: + self._init_status = self.INIT_STATUS_IN_PROGRESS + return load(self, *args, **kwargs) + except Exception: + if self._init_status != self.INIT_STATUS_OK: + self._init_status = self.INIT_STATUS_FAIL + raise return wrap - @_locked_load def cleanup(self): if self._vdi_shared_time: self._shared_lock_vdi(self.srcmd.params['vdi_uuid'], locked=False) @@ -657,6 +679,9 @@ def probe(self): @_locked_load def scan(self, uuid): + if self._init_status == 
self.INIT_STATUS_FAIL: + return + util.SMlog('LinstorSR.scan for {}'.format(self.uuid)) if not self._linstor: raise xs_errors.XenError( @@ -855,7 +880,6 @@ def _update_physical_size(self): def _load_vdis(self): if self._vdis_loaded: return - self._vdis_loaded = True assert self._is_master @@ -866,6 +890,9 @@ def _load_vdis(self): self._load_vdis_ex() self._destroy_linstor_cache() + # We must mark VDIs as loaded only if the load is a success. + self._vdis_loaded = True + self._undo_all_journal_transactions() def _load_vdis_ex(self): From cbb08f7a00698e746b96a06930bf7140e637d417 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 8 Mar 2021 13:25:28 +0100 Subject: [PATCH 031/133] fix(LinstorJournaler): ensure uri is not None during linstor.KV creation Signed-off-by: Ronan Abhamon --- drivers/linstorjournaler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/linstorjournaler.py b/drivers/linstorjournaler.py index 285012ca..3993f601 100755 --- a/drivers/linstorjournaler.py +++ b/drivers/linstorjournaler.py @@ -16,7 +16,8 @@ # -from linstorvolumemanager import get_controller_uri, LinstorVolumeManager +from linstorvolumemanager import \ + get_controller_uri, LinstorVolumeManager, LinstorVolumeManagerError import linstor import re import util @@ -145,6 +146,10 @@ def _create_journal_instance(cls, uri, group_name, namespace): def connect(uri): if not uri: uri = get_controller_uri() + if not uri: + raise LinstorVolumeManagerError( + 'Unable to find controller uri...' 
+ ) return linstor.KV( LinstorVolumeManager._build_group_name(group_name), uri=uri, @@ -153,13 +158,15 @@ def connect(uri): try: return connect(uri) - except linstor.errors.LinstorNetworkError: + except (linstor.errors.LinstorNetworkError, LinstorVolumeManagerError): pass return util.retry( lambda: connect(None), maxretry=10, - exceptions=[linstor.errors.LinstorNetworkError] + exceptions=[ + linstor.errors.LinstorNetworkError, LinstorVolumeManagerError + ] ) @staticmethod From 5bd2d5ede6c0db235f6b7ee77ac365e062db241b Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 22 Mar 2021 17:32:26 +0100 Subject: [PATCH 032/133] feat(LinstorSR): add an option to disable auto-quorum on volume DB + fix doc Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 9 ++++++++- drivers/linstorvolumemanager.py | 36 +++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 092f5e8e..9f2be58c 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -31,6 +31,7 @@ from lock import Lock import blktap2 import cleanup +import distutils import errno import functools import scsiutil @@ -77,7 +78,8 @@ ['group-name', 'LVM group name'], ['hosts', 'host names to use'], ['redundancy', 'replication count'], - ['provisioning', '"thin" or "thick" are accepted'] + ['provisioning', '"thin" or "thick" are accepted (optional, defaults to thin)'], + ['monitor-db-quorum', 'disable controller when only one host is online (optional, defaults to true)'] ] DRIVER_INFO = { @@ -300,6 +302,10 @@ def load(self, sr_uuid): else: self._provisioning = self.PROVISIONING_DEFAULT + monitor_db_quorum = self.dconf.get('monitor-db-quorum') + self._monitor_db_quorum = (monitor_db_quorum is None) or \ + distutils.util.strtobool(monitor_db_quorum) + # Note: We don't have access to the session field if the # 'vdi_attach_from_config' command is executed. 
self._has_session = self.sr_ref and self.session is not None @@ -553,6 +559,7 @@ def create(self, uuid, size): ips, self._redundancy, thin_provisioning=self._provisioning == 'thin', + auto_quorum=self._monitor_db_quorum, logger=util.SMlog ) self._vhdutil = LinstorVhdUtil(self.session, self._linstor) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index d8d64b4a..27c8df5d 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1261,15 +1261,17 @@ def invalidate_resource_cache(self): @classmethod def create_sr( cls, group_name, node_names, ips, redundancy, - thin_provisioning=False, + thin_provisioning, auto_quorum, logger=default_logger.__func__ ): """ Create a new SR on the given nodes. :param str group_name: The SR group_name to use. :param list[str] node_names: String list of nodes. + :param set(str) ips: Node ips. :param int redundancy: How many copy of volumes should we store? - :param set(str) ips: Node ips + :param bool thin_provisioning: Use thin or thick provisioning. + :param bool auto_quorum: DB quorum is monitored by LINSTOR. :param function logger: Function to log messages. :return: A new LinstorSr instance. :rtype: LinstorSr @@ -1283,6 +1285,7 @@ def create_sr( ips, redundancy, thin_provisioning, + auto_quorum, logger ) finally: @@ -1300,7 +1303,7 @@ def create_sr( @classmethod def _create_sr( cls, group_name, node_names, ips, redundancy, - thin_provisioning=False, + thin_provisioning, auto_quorum, logger=default_logger.__func__ ): # 1. Check if SR already exists. @@ -1406,7 +1409,9 @@ def _create_sr( # 3. Create the LINSTOR database volume and mount it. 
try: logger('Creating database volume...') - volume_path = cls._create_database_volume(lin, group_name) + volume_path = cls._create_database_volume( + lin, group_name, auto_quorum + ) except LinstorVolumeManagerError as e: if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: logger('Destroying database volume after creation fail...') @@ -2113,7 +2118,7 @@ def _request_database_path(cls, lin, activate=False): return resources[0].volumes[0].device_path @classmethod - def _create_database_volume(cls, lin, group_name): + def _create_database_volume(cls, lin, group_name, auto_quorum): try: dfns = lin.resource_dfn_list_raise().resource_definitions except Exception as e: @@ -2139,16 +2144,17 @@ def _create_database_volume(cls, lin, group_name): # We must modify the quorum. Otherwise we can't use correctly the # minidrbdcluster daemon. - result = lin.resource_dfn_modify(DATABASE_VOLUME_NAME, { - 'DrbdOptions/auto-quorum': 'disabled', - 'DrbdOptions/Resource/quorum': 'majority' - }) - error_str = cls._get_error_str(result) - if error_str: - raise LinstorVolumeManagerError( - 'Could not activate quorum on database volume: {}' - .format(error_str) - ) + if auto_quorum: + result = lin.resource_dfn_modify(DATABASE_VOLUME_NAME, { + 'DrbdOptions/auto-quorum': 'disabled', + 'DrbdOptions/Resource/quorum': 'majority' + }) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not activate quorum on database volume: {}' + .format(error_str) + ) current_device_path = cls._request_database_path(lin, activate=True) From 8f1951343cd6680346c307c7a2b838991f852ff8 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 23 Mar 2021 14:49:39 +0100 Subject: [PATCH 033/133] fix(LinstorVolumeManager): add a workaround to create properly SR with thin LVM Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py 
index 27c8df5d..3aaffdf4 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -2134,6 +2134,17 @@ def _create_database_volume(cls, lin, group_name, auto_quorum): ) + 'LINSTOR volume list must be empty.' ) + # Workaround to use thin lvm. Without this line an error is returned: + # "Not enough available nodes" + # I don't understand why but this command protect against this bug. + try: + lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to get storage pool list before database creation: {}' + .format(e) + ) + size = cls.round_up_volume_size(DATABASE_SIZE) cls._check_volume_creation_errors(lin.resource_group_spawn( rsc_grp_name=group_name, From 1bbc53a2b6d6e5f5d837f850de2a1927943957e0 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 24 Mar 2021 10:06:58 +0100 Subject: [PATCH 034/133] feat(LinstorSR): add optional ips parameter Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 30 ++++++++++++++++++++----- drivers/linstorvolumemanager.py | 40 ++++++++++++++++++++++----------- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 9f2be58c..4b761b56 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -77,6 +77,7 @@ CONFIGURATION = [ ['group-name', 'LVM group name'], ['hosts', 'host names to use'], + ['ips', 'ips to use (optional, defaults to management networks)'], ['redundancy', 'replication count'], ['provisioning', '"thin" or "thick" are accepted (optional, defaults to thin)'], ['monitor-db-quorum', 'disable controller when only one host is online (optional, defaults to true)'] @@ -325,6 +326,10 @@ def load(self, sr_uuid): self.sr_vditype = SR.DEFAULT_TAP self._hosts = list(set(self.dconf['hosts'].split(','))) + if 'ips' not in self.dconf or not self.dconf['ips']: + self._ips = None + else: + self._ips = self.dconf['ips'].split(',') self._redundancy = 
int(self.dconf['redundancy'] or 1) self._linstor = None # Ensure that LINSTOR attribute exists. self._journaler = None @@ -533,11 +538,26 @@ def create(self, uuid, size): ) ips = {} - for host in online_hosts: - record = self.session.xenapi.host.get_record(host) - hostname = record['hostname'] - if hostname in self._hosts: - ips[hostname] = record['address'] + if not self._ips: + for host in online_hosts: + record = self.session.xenapi.host.get_record(host) + hostname = record['hostname'] + if hostname in self._hosts: + ips[hostname] = record['address'] + elif len(self._ips) != len(self._hosts): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='ips must be equal to host count' + ) + else: + for host in online_hosts: + record = self.session.xenapi.host.get_record(host) + hostname = record['hostname'] + try: + index = self._hosts.index(hostname) + ips[hostname] = self._ips[index] + except ValueError as e: + pass if len(ips) != len(self._hosts): raise xs_errors.XenError( diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 3aaffdf4..5c04d028 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1313,22 +1313,36 @@ def _create_sr( for node_name in node_names: ip = ips[node_name] - result = lin.node_create( - node_name, - linstor.consts.VAL_NODE_TYPE_CMBD, - ip - ) - errors = cls._filter_errors(result) - if cls._check_errors(errors, [linstor.consts.FAIL_EXISTS_NODE]): - continue - if errors: - raise LinstorVolumeManagerError( - 'Failed to create node `{}` with ip `{}`: {}'.format( - node_name, ip, cls._get_error_str(errors) - ) + while True: + # Try to create node. + result = lin.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip ) + errors = cls._filter_errors(result) + if cls._check_errors( + errors, [linstor.consts.FAIL_EXISTS_NODE] + ): + # If it already exists, remove, then recreate. 
+ result = lin.node_delete(node_name) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to remove old node `{}`: {}' + .format(node_name, error_str) + ) + elif not errors: + break # Created! + else: + raise LinstorVolumeManagerError( + 'Failed to create node `{}` with ip `{}`: {}'.format( + node_name, ip, cls._get_error_str(errors) + ) + ) + driver_pool_name = group_name group_name = cls._build_group_name(group_name) pools = lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) From 918eaee4a1343026883c8c8ec9714e0bf0350d03 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 26 Mar 2021 16:13:20 +0100 Subject: [PATCH 035/133] feat(LinstorSR): add a helper `log_drbd_erofs` to trace EROFS errno code with DRBD resources + check EROFS error Signed-off-by: Ronan Abhamon --- drivers/blktap2.py | 18 ++++++- drivers/linstor-manager | 19 +++++++- drivers/linstorvolumemanager.py | 83 ++++++++++++++++++++++++++++++++- drivers/vhdutil.py | 2 +- 4 files changed, 118 insertions(+), 4 deletions(-) diff --git a/drivers/blktap2.py b/drivers/blktap2.py index e1f75e9f..21f5dfc5 100755 --- a/drivers/blktap2.py +++ b/drivers/blktap2.py @@ -36,6 +36,7 @@ import xs_errors import XenAPI import scsiutil +from linstorvolumemanager import log_lsof_drbd from syslog import openlog, syslog from stat import * # S_ISBLK(), ... 
import nfs @@ -817,7 +818,22 @@ def launch_on_tap(cls, blktap, path, _type, options): TapCtl.attach(pid, minor) try: - TapCtl.open(pid, minor, _type, path, options) + retry_open = 0 + while True: + try: + TapCtl.open(pid, minor, _type, path, options) + except TapCtl.CommandFailure as e: + err = ( + 'status' in e.info and e.info['status'] + ) or None + if err in (errno.EIO, errno.EROFS, errno.EAGAIN): + if retry_open < 5: + retry_open += 1 + time.sleep(1) + continue + if err == errno.EROFS: + log_lsof_drbd(path) + break try: tapdisk = cls.__from_blktap(blktap) node = '/sys/dev/block/%d:%d' % (tapdisk.major(), tapdisk.minor) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index f82b73f2..a06ed201 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -327,6 +327,22 @@ def lock_vdi(session, args): return str(False) +def lsof_resource(session, args): + try: + drbd_path = args['drbdPath'] + (ret, stdout, stderr) = util.doexec(['lsof', drbd_path]) + if ret == 0: + return 'DRBD resource `{}` is open: {}'.format( + drbd_path, stdout + ) + return '`lsof` on DRBD resource `{}` returned {}: {}'.format( + drbd_path, ret, stderr + ) + except Exception as e: + util.SMlog('linstor-manager:lsof_drbd error: {}'.format(e)) + raise + + if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, @@ -344,5 +360,6 @@ if __name__ == '__main__': 'getDepth': get_depth, 'getKeyHash': get_key_hash, 'getBlockBitmap': get_block_bitmap, - 'lockVdi': lock_vdi + 'lockVdi': lock_vdi, + 'lsofResource': lsof_resource }) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 5c04d028..0357b92d 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -16,6 +16,7 @@ # +import errno import glob import json import linstor @@ -23,6 +24,7 @@ import re import shutil import socket +import stat import time import util import uuid @@ -37,6 +39,85 @@ REG_DRBDADM_PRIMARY = 
re.compile("([^\\s]+)\\s+role:Primary") REG_DRBDSETUP_IP = re.compile('[^\\s]+\\s+(.*):.*$') +DRBD_BY_RES_PATH = '/dev/drbd/by-res/' + + +# Check if a path is a DRBD resource and log the process name/pid +# that opened it. +def log_lsof_drbd(path): + PLUGIN = 'linstor-manager' + PLUGIN_CMD = 'lsofResource' + + # Ignore if it's not a symlink to DRBD resource. + if not path.startswith(DRBD_BY_RES_PATH): + return + + # Compute resource name. + res_name_end = path.find('/', len(DRBD_BY_RES_PATH)) + if res_name_end == -1: + return + res_name = path[len(DRBD_BY_RES_PATH):res_name_end] + + try: + # Ensure path is a DRBD. + drbd_path = os.path.realpath(path) + stats = os.stat(drbd_path) + if not stat.S_ISBLK(stats.st_mode) or os.major(stats.st_rdev) != 147: + return + + # Find where the device is open. + (ret, stdout, stderr) = util.doexec(['drbdadm', 'status', res_name]) + if ret != 0: + util.SMlog('Failed to execute `drbdadm status` on `{}`: {}'.format( + res_name, stderr + )) + return + + # Is it a local device? + if stdout.startswith('{} role:Primary'.format(res_name)): + (ret, stdout, stderr) = util.doexec(['lsof', drbd_path]) + if ret == 0: + util.SMlog( + 'DRBD resource `{}` is open on local host: {}' + .format(path, stdout) + ) + else: + util.SMlog( + '`lsof` on local DRBD resource `{}` returned {}: {}' + .format(path, ret, stderr) + ) + return + + # Is it a remote device? 
+ res = REG_DRBDADM_PRIMARY.search(stdout) + if not res: + util.SMlog( + 'Cannot find where is open DRBD resource `{}`' + .format(path) + ) + return + node_name = res.groups()[0] + + session = util.get_localAPI_session() + hosts = session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + if node_name != host_record['hostname']: + continue + + ret = session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {'drbdPath': drbd_path}, + ) + util.SMlog('DRBD resource `{}` status on host `{}`: {}'.format( + path, host_ref, ret + )) + return + util.SMlog('Cannot find primary host of DRBD resource {}'.format(path)) + except Exception as e: + util.SMlog( + 'Got exception while trying to determine where DRBD resource ' + + '`{}` is open: {}'.format(path, e) + ) + # ============================================================================== @@ -162,7 +243,7 @@ class LinstorVolumeManager(object): '_kv_cache_dirty', '_resource_cache_dirty', '_volume_info_cache_dirty' ) - DEV_ROOT_PATH = '/dev/drbd/by-res/' + DEV_ROOT_PATH = DRBD_BY_RES_PATH # Default LVM extent size. BLOCK_SIZE = 4 * 1024 * 1024 diff --git a/drivers/vhdutil.py b/drivers/vhdutil.py index 422834eb..0a8fe918 100755 --- a/drivers/vhdutil.py +++ b/drivers/vhdutil.py @@ -99,7 +99,7 @@ def fullSizeVHD(virtual_size): def ioretry(cmd): return util.ioretry(lambda: util.pread2(cmd), - errlist = [errno.EIO, errno.EAGAIN]) + errlist = [errno.EIO, errno.EROFS, errno.EAGAIN]) def getVHDInfo(path, extractUuidFunction, includeParent = True): """Get the VHD info. 
The parent info may optionally be omitted: vhd-util From 2d272e23a2536f14ca7c0952e635b352589acffd Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 28 Apr 2021 15:15:58 +0200 Subject: [PATCH 036/133] fix(LinstorSR): try to restart the services again if there is a failure in linstor-manager Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index a06ed201..dcd4bc6f 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -16,6 +16,7 @@ import base64 import distutils.util +import time import subprocess import sys import XenAPIPlugin @@ -52,20 +53,36 @@ def update_all_ports(open): def enable_and_start_service(name, start): - fn = 'enable' if start else 'disable' - args = ('systemctl', fn, '--now', name) - (ret, out, err) = util.doexec(args) - if ret == 0: - return - raise Exception('Failed to {} {}: {} {}'.format(fn, name, out, err)) + attempt = 0 + while True: + attempt += 1 + fn = 'enable' if start else 'disable' + args = ('systemctl', fn, '--now', name) + (ret, out, err) = util.doexec(args) + if ret == 0: + return + elif attempt >= 3: + raise Exception( + 'Failed to {} {}: {} {}'.format(fn, name, out, err) + ) + time.sleep(1) def restart_service(name): - args = ('systemctl', 'restart', name) - (ret, out, err) = util.doexec(args) - if ret == 0: - return - raise Exception('Failed to restart {}: {} {}'.format(name, out, err)) + attempt = 0 + while True: + attempt += 1 + util.SMlog('linstor-manager:restart service {} {}...'.format(name, attempt)) + args = ('systemctl', 'restart', name) + (ret, out, err) = util.doexec(args) + if ret == 0: + return + elif attempt >= 3: + util.SMlog('linstor-manager:restart service FAILED {} {}'.format(name, attempt)) + raise Exception( + 'Failed to restart {}: {} {}'.format(name, out, err) + ) + time.sleep(1) def stop_service(name): From 
6966d46e51c95b04712e09ac30983eaf11297d25 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 26 Aug 2021 15:26:11 +0200 Subject: [PATCH 037/133] fix(LinstorSR): robustify linstor-manager to never include from plugins path Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index dcd4bc6f..f12747f1 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -14,14 +14,18 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +# We must modify default import path, we don't want to import modules +# installed in plugins folder and instead we must import from LINSTOR driver +# folder. +import sys +sys.path[0] = '/opt/xensource/sm/' + import base64 import distutils.util import time import subprocess -import sys import XenAPIPlugin -sys.path.append('/opt/xensource/sm/') from linstorjournaler import LinstorJournaler from linstorvolumemanager import get_controller_uri, LinstorVolumeManager from lock import Lock From 5f0a8ecb30ba6a7dafeee19997783a8d19944074 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 26 Aug 2021 16:52:01 +0200 Subject: [PATCH 038/133] fix(LinstorSR): prevent starting controller during fail in linstor manager destroy method Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 0357b92d..e9b7c2f3 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1263,9 +1263,11 @@ def destroy(self): 'It exists remaining volumes' ) + controller_is_running = self._controller_is_running() uri = 'linstor://localhost' try: - self._start_controller(start=False) + if controller_is_running: + self._start_controller(start=False) # 1. Umount LINSTOR database. 
self._mount_database_volume( @@ -1290,7 +1292,7 @@ def destroy(self): self._linstor, pool.name, pool.node_name ) except Exception as e: - self._start_controller(start=True) + self._start_controller(start=controller_is_running) raise e try: @@ -2503,6 +2505,10 @@ def _check_errors(result, codes): return True return False + @classmethod + def _controller_is_running(cls): + return cls._service_is_running('linstor-controller') + @classmethod def _start_controller(cls, start=True): return cls._start_service('linstor-controller', start) @@ -2519,6 +2525,13 @@ def _start_service(name, start=True): .format(action, name, out, err) ) + @staticmethod + def _service_is_running(name): + (ret, out, err) = util.doexec([ + 'systemctl', 'is-active', '--quiet', name + ]) + return not ret + @staticmethod def _is_mounted(mountpoint): (ret, out, err) = util.doexec(['mountpoint', '-q', mountpoint]) From 4dbab1dced46ec61542a138aa96af402632a4bbb Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 19 Oct 2021 14:48:17 +0200 Subject: [PATCH 039/133] feat(LinstorVolumeManager): increase peer slots limit (support 31 connections to a DRBD) - Also, create diskless devices when db is created Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 84 ++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index e9b7c2f3..553e2f50 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1507,7 +1507,7 @@ def _create_sr( try: logger('Creating database volume...') volume_path = cls._create_database_volume( - lin, group_name, auto_quorum + lin, group_name, node_names, redundancy, auto_quorum ) except LinstorVolumeManagerError as e: if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: @@ -1786,18 +1786,32 @@ def _create_volume( size = self.round_up_volume_size(size) self._mark_resource_cache_as_dirty() - # A. 
Basic case when we use the default redundancy of the group. - if not no_diskless: + def create_definition(): self._check_volume_creation_errors( self._linstor.resource_group_spawn( rsc_grp_name=self._group_name, rsc_dfn_name=volume_name, vlm_sizes=['{}B'.format(size)], - definitions_only=not place_resources + definitions_only=True ), volume_uuid, self._group_name ) + self._increase_volume_peer_slots(self._linstor, volume_name) + + # A. Basic case when we use the default redundancy of the group. + if not no_diskless: + create_definition() + if place_resources: + self._check_volume_creation_errors( + self._linstor.resource_auto_place( + rsc_name=volume_name, + place_count=self._redundancy, + diskless_on_remaining=not no_diskless + ), + volume_uuid, + self._group_name + ) return # B. Complex case. @@ -1829,17 +1843,7 @@ def clean(): def create(): try: - self._check_volume_creation_errors( - self._linstor.resource_group_spawn( - rsc_grp_name=self._group_name, - rsc_dfn_name=volume_name, - vlm_sizes=['{}B'.format(size)], - definitions_only=True - ), - volume_uuid, - self._group_name - ) - + create_definition() result = self._linstor.resource_create(resources) error_str = self._get_error_str(result) if error_str: @@ -2164,6 +2168,16 @@ def connect(uri): ] ) + @classmethod + def _increase_volume_peer_slots(cls, lin, volume_name): + result = lin.resource_dfn_modify(volume_name, {}, peer_slots=31) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not increase volume peer slots of {}: {}' + .format(volume_name, error_str) + ) + @classmethod def _activate_device_path(cls, lin, node_name, volume_name): result = lin.resource_create([ @@ -2215,7 +2229,9 @@ def _request_database_path(cls, lin, activate=False): return resources[0].volumes[0].device_path @classmethod - def _create_database_volume(cls, lin, group_name, auto_quorum): + def _create_database_volume( + cls, lin, group_name, node_names, redundancy, auto_quorum + ): try: 
dfns = lin.resource_dfn_list_raise().resource_definitions except Exception as e: @@ -2242,13 +2258,40 @@ def _create_database_volume(cls, lin, group_name, auto_quorum): .format(e) ) + # Create the database definition. size = cls.round_up_volume_size(DATABASE_SIZE) cls._check_volume_creation_errors(lin.resource_group_spawn( rsc_grp_name=group_name, rsc_dfn_name=DATABASE_VOLUME_NAME, vlm_sizes=['{}B'.format(size)], - definitions_only=False + definitions_only=True ), DATABASE_VOLUME_NAME, group_name) + cls._increase_volume_peer_slots(lin, DATABASE_VOLUME_NAME) + + # Create real resources on the first nodes. + resources = [] + for node_name in node_names[:redundancy]: + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=DATABASE_VOLUME_NAME, + storage_pool=group_name + )) + # Create diskless resources on the remaining set. + for node_name in node_names[redundancy:]: + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=DATABASE_VOLUME_NAME, + diskless=True + )) + + result = lin.resource_create(resources) + error_str = cls._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create database volume from SR `{}`: {}'.format( + group_name, error_str + ) + ) # We must modify the quorum. Otherwise we can't use correctly the # minidrbdcluster daemon. @@ -2264,8 +2307,15 @@ def _create_database_volume(cls, lin, group_name, auto_quorum): .format(error_str) ) + # Create database and ensure path exists locally and + # on replicated devices. current_device_path = cls._request_database_path(lin, activate=True) + # Ensure diskless paths exist on other hosts. Otherwise PBDs can't be + # plugged. + for node_name in node_names: + cls._activate_device_path(lin, node_name, DATABASE_VOLUME_NAME) + # We use realpath here to get the /dev/drbd path instead of # /dev/drbd/by-res/. 
expected_device_path = cls.build_device_path(DATABASE_VOLUME_NAME) From 7d60255c8c19a363d687ca9ac2a2e764a41c179e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 20 Oct 2021 14:33:04 +0200 Subject: [PATCH 040/133] feat(LinstorVolumeManager): add a fallback to find controller uri (when len(hosts) >= 4) Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 10 +++++++- drivers/linstorvolumemanager.py | 42 ++++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index f12747f1..afc4bfe5 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -364,6 +364,13 @@ def lsof_resource(session, args): raise +def has_controller_running(session, args): + (ret, stdout, stderr) = util.doexec([ + 'systemctl', 'is-active', '--quiet', 'linstor-controller' + ]) + return str(ret == 0) + + if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, @@ -382,5 +389,6 @@ if __name__ == '__main__': 'getKeyHash': get_key_hash, 'getBlockBitmap': get_block_bitmap, 'lockVdi': lock_vdi, - 'lsofResource': lsof_resource + 'lsofResource': lsof_resource, + 'hasControllerRunning': has_controller_running }) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 553e2f50..821ef420 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -16,6 +16,7 @@ # +import distutils.util import errno import glob import json @@ -41,11 +42,12 @@ DRBD_BY_RES_PATH = '/dev/drbd/by-res/' +PLUGIN = 'linstor-manager' + # Check if a path is a DRBD resource and log the process name/pid # that opened it. def log_lsof_drbd(path): - PLUGIN = 'linstor-manager' PLUGIN_CMD = 'lsofResource' # Ignore if it's not a symlink to DRBD resource. @@ -159,21 +161,39 @@ def get_remote_host_ip(node_name): def _get_controller_uri(): + PLUGIN_CMD = 'hasControllerRunning' + + # Try to find controller using drbdadm. 
(ret, stdout, stderr) = util.doexec([ 'drbdadm', 'status', DATABASE_VOLUME_NAME ]) - if ret != 0: - return + if ret == 0: + # If we are here, the database device exists locally. - if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): - return 'linstor://localhost' + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + # Nice case, we have the controller running on this local host. + return 'linstor://localhost' - res = REG_DRBDADM_PRIMARY.search(stdout) - if res: - node_name = res.groups()[0] - ip = get_remote_host_ip(node_name) - if ip: - return 'linstor://' + ip + # Try to find the host using DRBD connections. + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + node_name = res.groups()[0] + ip = get_remote_host_ip(node_name) + if ip: + return 'linstor://' + ip + + # Worst case: we use many hosts in the pool (>= 4), so we can't find the + # primary using drbdadm because we don't have all connections to the + # replicated volume. `drbdadm status xcp-persistent-database` returns + # 3 connections by default. + session = util.get_localAPI_session() + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + if distutils.util.strtobool( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) + ): + return 'linstor://' + host_record['hostname'] + + # Not found, maybe we are trying to create the SR... def get_controller_uri(): From f67684163ec4f99067023cc1a9212e28fa4d7a66 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 21 Oct 2021 11:13:07 +0200 Subject: [PATCH 041/133] fix(var-lib-linstor.mount): ensure we always mount database with RW flags Sometimes systemd fallback to read only FS if the volume can't be mounted, we must forbid that. It's probably a DRBD error. 
Signed-off-by: Ronan Abhamon --- Makefile | 2 +- drivers/linstor-manager | 4 ++-- etc/minidrbdcluster.ini | 2 +- etc/systemd/system/var-lib-linstor.mount | 6 ------ etc/systemd/system/var-lib-linstor.service | 21 +++++++++++++++++++++ 5 files changed, 25 insertions(+), 10 deletions(-) delete mode 100644 etc/systemd/system/var-lib-linstor.mount create mode 100644 etc/systemd/system/var-lib-linstor.service diff --git a/Makefile b/Makefile index a8d9253c..dfb1d440 100755 --- a/Makefile +++ b/Makefile @@ -181,7 +181,7 @@ install: precheck $(SM_STAGING)/$(LOGROTATE_DIR) install -m 644 etc/systemd/system/linstor-satellite.service.d/override.conf \ $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d/ - install -m 644 etc/systemd/system/var-lib-linstor.mount \ + install -m 644 etc/systemd/system/var-lib-linstor.service \ $(SM_STAGING)/$(SYSTEMD_CONF_DIR) install -m 644 etc/minidrbdcluster.ini \ $(SM_STAGING)/$(MINI_DRBD_CLUSTER_CONF_DIR) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index afc4bfe5..af8d2b9e 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -187,7 +187,7 @@ def destroy(session, args): # When destroy is called, there are no running minidrbdcluster daemons. # So the controllers are stopped too, we must start an instance. - restart_service('var-lib-linstor.mount') + restart_service('var-lib-linstor.service') restart_service('linstor-controller') linstor = LinstorVolumeManager( @@ -199,7 +199,7 @@ def destroy(session, args): return str(True) except Exception as e: stop_service('linstor-controller') - stop_service('var-lib-linstor.mount') + stop_service('var-lib-linstor.service') util.SMlog('linstor-manager:destroy error: {}'.format(e)) return str(False) diff --git a/etc/minidrbdcluster.ini b/etc/minidrbdcluster.ini index 0126e862..9e523427 100644 --- a/etc/minidrbdcluster.ini +++ b/etc/minidrbdcluster.ini @@ -5,7 +5,7 @@ # section name the systemd-units to activate on one of the nodes. 
[xcp-persistent-database] -systemd-units=var-lib-linstor.mount,linstor-controller.service +systemd-units=var-lib-linstor.service,linstor-controller.service [xcp-persistent-ha-statefile] systemd-units= diff --git a/etc/systemd/system/var-lib-linstor.mount b/etc/systemd/system/var-lib-linstor.mount deleted file mode 100644 index a05a7f74..00000000 --- a/etc/systemd/system/var-lib-linstor.mount +++ /dev/null @@ -1,6 +0,0 @@ -[Unit] -Description=Filesystem for the LINSTOR controller - -[Mount] -What=/dev/drbd/by-res/xcp-persistent-database/0 -Where=/var/lib/linstor diff --git a/etc/systemd/system/var-lib-linstor.service b/etc/systemd/system/var-lib-linstor.service new file mode 100644 index 00000000..d230d048 --- /dev/null +++ b/etc/systemd/system/var-lib-linstor.service @@ -0,0 +1,21 @@ +# Regarding the current version of systemd (v.219) used in XCP-ng, we can't use +# the ReadWriteOnly option (to apply the -w flag, it's not the same than -o rw). +# This file is a workaround to avoid RO. It must be replaced with the code below +# in a mount unit. Compatible with version >= 246. 
+# +# [Unit] +# Description=Filesystem for the LINSTOR controller +# +# [Mount] +# What=/dev/drbd/by-res/xcp-persistent-database/0 +# Where=/var/lib/linstor +# ReadWriteOnly=true + +[Unit] +Description=Mount filesystem for the LINSTOR controller + +[Service] +Type=oneshot +ExecStart=/bin/mount -w /dev/drbd/by-res/xcp-persistent-database/0 /var/lib/linstor +ExecStop=/bin/umount /var/lib/linstor +RemainAfterExit=true From 50b4a06c119ffc3af774aaa1b4a3d3dce60cc415 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 21 Oct 2021 11:51:32 +0200 Subject: [PATCH 042/133] feat(LinstorVolumeManager): add a fallback to find node name (when len(hosts) >= 4) Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 821ef420..e497afa6 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -210,19 +210,26 @@ def get_controller_uri(): def get_controller_node_name(): + PLUGIN_CMD = 'hasControllerRunning' + (ret, stdout, stderr) = util.doexec([ 'drbdadm', 'status', DATABASE_VOLUME_NAME ]) - if ret != 0: - return - if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): - return 'localhost' + if ret == 0: + if stdout.startswith('{} role:Primary'.format(DATABASE_VOLUME_NAME)): + return 'localhost' - res = REG_DRBDADM_PRIMARY.search(stdout) - if res: - return res.groups()[0] + res = REG_DRBDADM_PRIMARY.search(stdout) + if res: + return res.groups()[0] + session = util.get_localAPI_session() + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + if distutils.util.strtobool( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) + ): + return host_record['hostname'] # ============================================================================== From 39ff8e5a31d4c2abe776ba14a9866160d7389e31 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon 
Date: Tue, 26 Oct 2021 10:44:00 +0200 Subject: [PATCH 043/133] feat(LinstorSR): explain on which host, plugins commands are executed Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 4b761b56..519afb29 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -775,13 +775,25 @@ def _shared_lock_vdi(self, vdi_uuid, locked=True): # Network. # -------------------------------------------------------------------------- - def _exec_manager_command(self, host, command, args, error): - ret = self.session.xenapi.host.call_plugin( - host, self.MANAGER_PLUGIN, command, args - ) + def _exec_manager_command(self, host_ref, command, args, error): + host_rec = self.session.xenapi.host.get_record(host_ref) + host_uuid = host_rec['uuid'] + + try: + ret = self.session.xenapi.host.call_plugin( + host_ref, self.MANAGER_PLUGIN, command, args + ) + except Exception as e: + util.SMlog( + 'call-plugin on {} ({}:{} with {}) raised'.format( + host_uuid, self.MANAGER_PLUGIN, command, args + ) + ) + raise e + util.SMlog( - 'call-plugin ({}:{} with {}) returned: {}'.format( - self.MANAGER_PLUGIN, command, args, ret + 'call-plugin on {} ({}:{} with {}) returned: {}'.format( + host_uuid, self.MANAGER_PLUGIN, command, args, ret ) ) if ret == 'False': From db287aa3c734dedb2c0957fd6b829503202ef97b Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 3 Nov 2021 14:59:31 +0100 Subject: [PATCH 044/133] fix(LinstorSR): create diskless path if necessary during VDI loading Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 519afb29..15b9dda3 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1899,7 +1899,16 @@ def _load_this(self): self.size = volume_info.virtual_size self.parent = '' else: - vhd_info = 
self.sr._vhdutil.get_vhd_info(self.uuid) + try: + vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) + except util.CommandException as e: + if e.code != errno.ENOENT: + raise + # Path doesn't exist. Probably a diskless without local path. + # Force creation and retry. + self._linstor.get_device_path(self.uuid) + vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) + self.hidden = vhd_info.hidden self.size = vhd_info.sizeVirt self.parent = vhd_info.parentUuid From cc24281c071d841e6e57cc5d0bfd9f5d3438984c Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 12 May 2022 17:52:35 +0200 Subject: [PATCH 045/133] feat(LinstorSR): use HTTP/NBD instead of DRBD directly with heartbeat VDI Signed-off-by: Ronan Abhamon --- Makefile | 1 + drivers/LinstorSR.py | 380 ++++++++++++++++++++++++++++---- drivers/linstor-manager | 43 +--- drivers/linstorvhdutil.py | 6 +- drivers/linstorvolumemanager.py | 17 +- drivers/util.py | 49 ++++ scripts/fork-log-daemon | 34 +++ 7 files changed, 438 insertions(+), 92 deletions(-) create mode 100755 scripts/fork-log-daemon diff --git a/Makefile b/Makefile index dfb1d440..89f7762c 100755 --- a/Makefile +++ b/Makefile @@ -238,6 +238,7 @@ install: precheck install -m 755 drivers/iscsilib.py $(SM_STAGING)$(SM_DEST) install -m 755 drivers/fcoelib.py $(SM_STAGING)$(SM_DEST) mkdir -p $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/fork-log-daemon $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/local-device-change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/check-device-sharing $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/usb_change $(SM_STAGING)$(LIBEXEC) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 15b9dda3..5bdf6769 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -34,9 +34,13 @@ import distutils import errno import functools +import os +import re import scsiutil +import signal import SR import SRCommand +import subprocess import time import traceback import util @@ -52,6 +56,8 @@ HIDDEN_TAG = 'hidden' 
+FORK_LOG_DAEMON = '/opt/xensource/libexec/fork-log-daemon' + # ============================================================================== # TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', @@ -354,7 +360,9 @@ def wrapped_method(self, *args, **kwargs): def load(self, *args, **kwargs): if not self._has_session: - if self.srcmd.cmd == 'vdi_attach_from_config': + if self.srcmd.cmd in ( + 'vdi_attach_from_config', 'vdi_detach_from_config' + ): # We must have a valid LINSTOR instance here without using # the XAPI. controller_uri = get_controller_uri() @@ -1444,7 +1452,7 @@ def raise_bad_load(e): if ( self.sr.srcmd.cmd == 'vdi_attach_from_config' or self.sr.srcmd.cmd == 'vdi_detach_from_config' - ) and self.sr.srcmd.params['vdi_uuid'] == self.uuid: + ): self.vdi_type = vhdutil.VDI_TYPE_RAW self.path = self.sr.srcmd.params['vdi_path'] else: @@ -1529,7 +1537,7 @@ def create(self, sr_uuid, vdi_uuid, size): self._linstor.create_volume( self.uuid, volume_size, persistent=False, - volume_name=volume_name, no_diskless=(volume_name is not None) + volume_name=volume_name ) volume_info = self._linstor.get_volume_info(self.uuid) @@ -1631,8 +1639,9 @@ def delete(self, sr_uuid, vdi_uuid, data_only=False): def attach(self, sr_uuid, vdi_uuid): util.SMlog('LinstorVDI.attach for {}'.format(self.uuid)) + attach_from_config = self.sr.srcmd.cmd == 'vdi_attach_from_config' if ( - self.sr.srcmd.cmd != 'vdi_attach_from_config' or + not attach_from_config or self.sr.srcmd.params['vdi_uuid'] != self.uuid ) and self.sr._journaler.has_entries(self.uuid): raise xs_errors.XenError( @@ -1641,50 +1650,54 @@ def attach(self, sr_uuid, vdi_uuid): 'scan SR first to trigger auto-repair' ) - writable = 'args' not in self.sr.srcmd.params or \ - self.sr.srcmd.params['args'][0] == 'true' + if not attach_from_config or self.sr._is_master: + writable = 'args' not in self.sr.srcmd.params or \ + self.sr.srcmd.params['args'][0] == 'true' - # We need to inflate the volume if we don't have enough 
place - # to mount the VHD image. I.e. the volume capacity must be greater - # than the VHD size + bitmap size. - need_inflate = True - if self.vdi_type == vhdutil.VDI_TYPE_RAW or not writable or \ - self.capacity >= compute_volume_size(self.size, self.vdi_type): - need_inflate = False - - if need_inflate: - try: - self._prepare_thin(True) - except Exception as e: - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Failed to attach VDI during "prepare thin": {}' - .format(e) - ) + # We need to inflate the volume if we don't have enough place + # to mount the VHD image. I.e. the volume capacity must be greater + # than the VHD size + bitmap size. + need_inflate = True + if ( + self.vdi_type == vhdutil.VDI_TYPE_RAW or + not writable or + self.capacity >= compute_volume_size(self.size, self.vdi_type) + ): + need_inflate = False - if not util.pathexists(self.path): - raise xs_errors.XenError( - 'VDIUnavailable', opterr='Could not find: {}'.format(self.path) - ) + if need_inflate: + try: + self._prepare_thin(True) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to attach VDI during "prepare thin": {}' + .format(e) + ) if not hasattr(self, 'xenstore_data'): self.xenstore_data = {} + self.xenstore_data['storage-type'] = LinstorSR.DRIVER_TYPE - # TODO: Is it useful? 
- self.xenstore_data.update(scsiutil.update_XS_SCSIdata( - self.uuid, scsiutil.gen_synthetic_page_data(self.uuid) - )) + if attach_from_config and self.path.startswith('/dev/http-nbd/'): + return self._attach_using_http_nbd() - self.xenstore_data['storage-type'] = LinstorSR.DRIVER_TYPE + if not util.pathexists(self.path): + raise xs_errors.XenError( + 'VDIUnavailable', opterr='Could not find: {}'.format(self.path) + ) self.attached = True - return VDI.VDI.attach(self, self.sr.uuid, self.uuid) def detach(self, sr_uuid, vdi_uuid): util.SMlog('LinstorVDI.detach for {}'.format(self.uuid)) + detach_from_config = self.sr.srcmd.cmd == 'vdi_detach_from_config' self.attached = False + if detach_from_config and self.path.startswith('/dev/http-nbd/'): + return self._detach_using_http_nbd() + if self.vdi_type == vhdutil.VDI_TYPE_RAW: return @@ -1816,25 +1829,40 @@ def generate_config(self, sr_uuid, vdi_uuid): util.SMlog('LinstorVDI.generate_config for {}'.format(self.uuid)) - if not self.path or not util.pathexists(self.path): - available = False - # Try to refresh symlink path... - try: - self.path = self._linstor.get_device_path(vdi_uuid) - available = util.pathexists(self.path) - except Exception: - pass - if not available: - raise xs_errors.XenError('VDIUnavailable') - resp = {} resp['device_config'] = self.sr.dconf resp['sr_uuid'] = sr_uuid resp['vdi_uuid'] = self.uuid resp['sr_sm_config'] = self.sr.sm_config - resp['vdi_path'] = self.path resp['command'] = 'vdi_attach_from_config' + # By default, we generate a normal config. + # But if the disk is persistent, we must use a HTTP/NBD + # server to ensure we can always write or read data. + # Why? DRBD is unsafe when used with more than 4 hosts: + # We are limited to use 1 diskless and 3 full. + # We can't increase this limitation, so we use a NBD/HTTP device + # instead. 
+ volume_name = self._linstor.get_volume_name(self.uuid) + if volume_name not in [ + 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' + ]: + if not self.path or not util.pathexists(self.path): + available = False + # Try to refresh symlink path... + try: + self.path = self._linstor.get_device_path(vdi_uuid) + available = util.pathexists(self.path) + except Exception: + pass + if not available: + raise xs_errors.XenError('VDIUnavailable') + + resp['vdi_path'] = self.path + else: + # Axiom: DRBD device is present on at least one host. + resp['vdi_path'] = '/dev/http-nbd/' + volume_name + config = xmlrpclib.dumps(tuple([resp]), 'vdi_attach_from_config') return xmlrpclib.dumps((config,), "", True) @@ -2314,6 +2342,268 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): return ret_vdi.get_params() + @staticmethod + def _start_persistent_http_server(volume_name): + null = None + pid_path = None + http_server = None + + try: + null = open(os.devnull, 'w') + + if volume_name == 'xcp-persistent-ha-statefile': + port = '8076' + else: + port = '8077' + + arguments = [ + 'http-disk-server', + '--disk', + '/dev/drbd/by-res/{}/0'.format(volume_name), + '--port', + port + ] + + util.SMlog('Starting {} on port {}...'.format(arguments[0], port)) + http_server = subprocess.Popen( + [FORK_LOG_DAEMON] + arguments, + stdout=null, + stderr=null, + # Ensure we use another group id to kill this process without + # touch the current one. + preexec_fn=os.setsid + ) + + pid_path = '/run/http-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(http_server.pid)) + except Exception as e: + if pid_path: + try: + os.remove(pid_path) + except Exception: + pass + + if http_server: + # Kill process and children in this case... 
+ os.killpg(os.getpgid(http_server.pid), signal.SIGTERM) + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to start http-server: {}'.format(e) + ) + finally: + if null: + null.close() + + def _start_persistent_nbd_server(self, volume_name): + null = None + pid_path = None + nbd_path = None + nbd_server = None + + try: + null = open(os.devnull, 'w') + + if volume_name == 'xcp-persistent-ha-statefile': + port = '8076' + else: + port = '8077' + + arguments = [ + 'nbd-http-server', + '--socket-path', + '/run/{}.socket'.format(volume_name), + '--nbd-name', + volume_name, + '--urls', + ','.join(map( + lambda host: "http://" + host + ':' + port, + self.sr._hosts + )) + ] + + util.SMlog('Starting {} using port {}...'.format(arguments[0], port)) + nbd_server = subprocess.Popen( + [FORK_LOG_DAEMON] + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + # Ensure we use another group id to kill this process without + # touch the current one. + preexec_fn=os.setsid + ) + + reg_nbd_path = re.compile("^NBD `(/dev/nbd[0-9]+)` is now attached.$") + def get_nbd_path(): + while nbd_server.poll() is None: + line = nbd_server.stdout.readline() + match = reg_nbd_path.match(line) + if match: + return match.group(1) + # Use a timeout to never block the smapi if there is a problem. 
+ try: + nbd_path = util.timeout_call(10, get_nbd_path) + if nbd_path is None: + raise Exception('Empty NBD path (NBD server is probably dead)') + except util.TimeoutException: + raise Exception('Unable to read NBD path') + + pid_path = '/run/nbd-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(nbd_server.pid)) + + util.SMlog('Create symlink: {} -> {}'.format(self.path, nbd_path)) + os.symlink(nbd_path, self.path) + except Exception as e: + if pid_path: + try: + os.remove(pid_path) + except Exception: + pass + + if nbd_path: + try: + os.remove(nbd_path) + except Exception: + pass + + if nbd_server: + # Kill process and children in this case... + os.killpg(os.getpgid(nbd_server.pid), signal.SIGTERM) + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Failed to start nbd-server: {}'.format(e) + ) + finally: + if null: + null.close() + + @classmethod + def _kill_persistent_server(self, type, volume_name, sig): + try: + path = '/run/{}-server-{}.pid'.format(type, volume_name) + if not os.path.exists(path): + return + + pid = None + with open(path, 'r') as pid_file: + try: + pid = int(pid_file.read()) + except Exception: + pass + + if pid is not None and util.check_pid_exists(pid): + util.SMlog('Kill {} server {} (pid={})'.format(type, path, pid)) + try: + os.killpg(os.getpgid(pid), sig) + except Exception as e: + util.SMlog('Failed to kill {} server: {}'.format(type, e)) + + os.remove(path) + except: + pass + + @classmethod + def _kill_persistent_http_server(self, volume_name, sig=signal.SIGTERM): + return self._kill_persistent_server('nbd', volume_name, sig) + + @classmethod + def _kill_persistent_nbd_server(self, volume_name, sig=signal.SIGTERM): + return self._kill_persistent_server('http', volume_name, sig) + + def _check_http_nbd_volume_name(self): + volume_name = self.path[14:] + if volume_name not in [ + 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' + ]: + raise xs_errors.XenError( + 
'VDIUnavailable', + opterr='Unsupported path: {}'.format(self.path) + ) + return volume_name + + def _attach_using_http_nbd(self): + volume_name = self._check_http_nbd_volume_name() + + # Ensure there is no NBD and HTTP server running. + self._kill_persistent_nbd_server(volume_name) + self._kill_persistent_http_server(volume_name) + + # 0. Fetch drbd path. + must_get_device_path = True + if not self.sr._is_master: + # We are on a slave, we must try to find a diskful locally. + try: + volume_info = self._linstor.get_volume_info(self.uuid) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot get volume info of {}: {}' + .format(self.uuid, e) + ) + + must_get_device_path = volume_info.is_diskful + + drbd_path = None + if must_get_device_path or self.sr._is_master: + # If we are master, we must ensure we have a diskless + # or diskful available to init HA. + # It also avoid this error in xensource.log + # (/usr/libexec/xapi/cluster-stack/xhad/ha_set_pool_state): + # init exited with code 8 [stdout = ''; stderr = 'SF: failed to write in State-File \x10 (fd 4208696). (sys 28)\x0A'] + # init returned MTC_EXIT_CAN_NOT_ACCESS_STATEFILE (State-File is inaccessible) + available = False + try: + drbd_path = self._linstor.get_device_path(self.uuid) + available = util.pathexists(drbd_path) + except Exception: + pass + + if not available: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot get device path of {}'.format(self.uuid) + ) + + # 1. Prepare http-nbd folder. + try: + if not os.path.exists('/dev/http-nbd/'): + os.makedirs('/dev/http-nbd/') + elif os.path.islink(self.path): + os.remove(self.path) + except OSError as e: + if e.errno != errno.EEXIST: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot prepare http-nbd: {}'.format(e) + ) + + # 2. Start HTTP service if we have a diskful or if we are master. 
+ http_service = None + if drbd_path: + assert(drbd_path in ( + '/dev/drbd/by-res/xcp-persistent-ha-statefile/0', + '/dev/drbd/by-res/xcp-persistent-redo-log/0' + )) + self._start_persistent_http_server(volume_name) + + # 3. Start NBD server in all cases. + try: + self._start_persistent_nbd_server(volume_name) + except Exception as e: + if drbd_path: + self._kill_persistent_http_server(volume_name) + raise + + self.attached = True + return VDI.VDI.attach(self, self.sr.uuid, self.uuid) + + def _detach_using_http_nbd(self): + volume_name = self._check_http_nbd_volume_name() + self._kill_persistent_nbd_server(volume_name) + self._kill_persistent_http_server(volume_name) + # ------------------------------------------------------------------------------ diff --git a/drivers/linstor-manager b/drivers/linstor-manager index af8d2b9e..30230adb 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -36,7 +36,7 @@ import vhdutil FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' -LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000'] +LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000', 8076, 8077] def update_port(port, open): @@ -56,39 +56,6 @@ def update_all_ports(open): update_port(port, open) -def enable_and_start_service(name, start): - attempt = 0 - while True: - attempt += 1 - fn = 'enable' if start else 'disable' - args = ('systemctl', fn, '--now', name) - (ret, out, err) = util.doexec(args) - if ret == 0: - return - elif attempt >= 3: - raise Exception( - 'Failed to {} {}: {} {}'.format(fn, name, out, err) - ) - time.sleep(1) - - -def restart_service(name): - attempt = 0 - while True: - attempt += 1 - util.SMlog('linstor-manager:restart service {} {}...'.format(name, attempt)) - args = ('systemctl', 'restart', name) - (ret, out, err) = util.doexec(args) - if ret == 0: - return - elif attempt >= 3: - util.SMlog('linstor-manager:restart service FAILED {} {}'.format(name, attempt)) - raise Exception( - 'Failed to restart {}: {} {}'.format(name, 
out, err) - ) - time.sleep(1) - - def stop_service(name): args = ('systemctl', 'stop', name) (ret, out, err) = util.doexec(args) @@ -98,11 +65,11 @@ def stop_service(name): def update_linstor_satellite_service(start): - enable_and_start_service('linstor-satellite', start) + util.enable_and_start_service('linstor-satellite', start) def update_minidrbdcluster_service(start): - enable_and_start_service('minidrbdcluster', start) + util.enable_and_start_service('minidrbdcluster', start) def prepare_sr(session, args): @@ -187,8 +154,8 @@ def destroy(session, args): # When destroy is called, there are no running minidrbdcluster daemons. # So the controllers are stopped too, we must start an instance. - restart_service('var-lib-linstor.service') - restart_service('linstor-controller') + util.restart_service('var-lib-linstor.service') + util.restart_service('linstor-controller') linstor = LinstorVolumeManager( 'linstor://localhost', diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index ac858371..9ba0ac3b 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -42,7 +42,7 @@ def wrapper(*args, **kwargs): # Try to read locally if the device is not in use or if the device # is up to date and not diskless. (node_names, in_use) = \ - self._linstor.find_up_to_date_diskfull_nodes(vdi_uuid) + self._linstor.find_up_to_date_diskful_nodes(vdi_uuid) try: if not in_use or socket.gethostname() in node_names: @@ -217,7 +217,7 @@ def _extract_uuid(self, device_path): def _get_readonly_host(self, vdi_uuid, device_path, node_names): """ When vhd-util is called to fetch VDI info we must find a - diskfull DRBD disk to read the data. It's the goal of this function. + diskful DRBD disk to read the data. It's the goal of this function. Why? Because when a VHD is open in RO mode, the LVM layer is used directly to bypass DRBD verifications (we can have only one process that reads/writes to disk with DRBD devices). 
@@ -226,7 +226,7 @@ def _get_readonly_host(self, vdi_uuid, device_path, node_names): if not node_names: raise xs_errors.XenError( 'VDIUnavailable', - opterr='Unable to find diskfull node: {} (path={})' + opterr='Unable to find diskful node: {} (path={})' .format(vdi_uuid, device_path) ) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index e497afa6..da98e0b6 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -329,18 +329,21 @@ class VolumeInfo(object): __slots__ = ( 'name', 'allocated_size', # Allocated size, place count is not used. - 'virtual_size' # Total virtual available size of this volume - # (i.e. the user size at creation). + 'virtual_size', # Total virtual available size of this volume + # (i.e. the user size at creation). + 'is_diskful' ) def __init__(self, name): self.name = name self.allocated_size = 0 self.virtual_size = 0 + self.is_diskful = False def __repr__(self): - return 'VolumeInfo("{}", {}, {})'.format( - self.name, self.allocated_size, self.virtual_size + return 'VolumeInfo("{}", {}, {}, {})'.format( + self.name, self.allocated_size, self.virtual_size, + 'diskful' if self.is_diskful else 'diskless' ) # -------------------------------------------------------------------------- @@ -1332,9 +1335,9 @@ def destroy(self): .format(e) ) - def find_up_to_date_diskfull_nodes(self, volume_uuid): + def find_up_to_date_diskful_nodes(self, volume_uuid): """ - Find all nodes that contain a specific volume using diskfull disks. + Find all nodes that contain a specific volume using diskful disks. The disk must be up to data to be used. :param str volume_uuid: The volume to use. :return: The available nodes. @@ -1716,6 +1719,8 @@ def _get_volumes_info(self, volume_name=None): else: current = all_volume_info[resource.name] + current.is_diskful = linstor.consts.FLAG_DISKLESS not in resource.flags + for volume in resource.volumes: # We ignore diskless pools of the form "DfltDisklessStorPool". 
if volume.storage_pool_name == self._group_name: diff --git a/drivers/util.py b/drivers/util.py index 2edd888e..50b62858 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -1796,3 +1796,52 @@ def sessions_less_than_targets(other_config, device_config): else: return False + +def enable_and_start_service(name, start): + attempt = 0 + while True: + attempt += 1 + fn = 'enable' if start else 'disable' + args = ('systemctl', fn, '--now', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + elif attempt >= 3: + raise Exception( + 'Failed to {} {}: {} {}'.format(fn, name, out, err) + ) + time.sleep(1) + + +def stop_service(name): + args = ('systemctl', 'stop', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + raise Exception('Failed to stop {}: {} {}'.format(name, out, err)) + + +def restart_service(name): + attempt = 0 + while True: + attempt += 1 + SMlog('Restarting service {} {}...'.format(name, attempt)) + args = ('systemctl', 'restart', name) + (ret, out, err) = doexec(args) + if ret == 0: + return + elif attempt >= 3: + SMlog('Restart service FAILED {} {}'.format(name, attempt)) + raise Exception( + 'Failed to restart {}: {} {}'.format(name, out, err) + ) + time.sleep(1) + + +def check_pid_exists(pid): + try: + os.kill(pid, 0) + except OSError: + return False + else: + return True diff --git a/scripts/fork-log-daemon b/scripts/fork-log-daemon new file mode 100755 index 00000000..eb0f0b0f --- /dev/null +++ b/scripts/fork-log-daemon @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +import select +import subprocess +import sys +import syslog + +def main(): + process = subprocess.Popen(sys.argv[1:], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + write_to_stdout = True + + while process.poll() is None: + while True: + output = process.stdout.readline() + if not output: + break + + if write_to_stdout: + try: + print(output) + sys.stdout.flush() + except Exception: + # Probably a broken pipe. So the process reading stdout is dead. 
+ write_to_stdout = False + syslog.syslog(output) + +if __name__ == "__main__": + syslog.openlog(ident=sys.argv[1], facility=syslog.LOG_DAEMON) + try: + main() + except Exception as e: + syslog.syslog(sys.argv[1] + ' terminated with exception: {}'.format(e)) + finally: + syslog.syslog(sys.argv[1] + ' is now terminated!') From d0cfc907aa5e1dcf48742cf72616fb7da78e4a0e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 3 Mar 2022 15:02:17 +0100 Subject: [PATCH 046/133] fix(LinstorSR): find controller when XAPI unreachable (XHA) Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 60 +++++++++++++++++++++++++++++---- drivers/linstorvolumemanager.py | 34 ++++++++++++------- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 5bdf6769..a4f7afce 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -46,6 +46,7 @@ import util import VDI import vhdutil +import xml.etree.ElementTree as xml_parser import xmlrpclib import xs_errors @@ -56,6 +57,8 @@ HIDDEN_TAG = 'hidden' +XHA_CONFIG_PATH = '/etc/xensource/xhad.conf' + FORK_LOG_DAEMON = '/opt/xensource/libexec/fork-log-daemon' # ============================================================================== @@ -248,6 +251,26 @@ def deflate(linstor, vdi_uuid, vdi_path, new_size, old_size): # TODO: Change the LINSTOR volume size using linstor.resize_volume. +def get_ips_from_xha_config_file(): + ips = [] + try: + # Ensure there is no dirty read problem. + # For example if the HA is reloaded. 
+ tree = util.retry( + lambda: xml_parser.parse(XHA_CONFIG_PATH), + maxretry=10, + period=1 + ) + + for node in tree.getroot()[0]: + if node.tag == 'host': + for host_node in node: + if host_node.tag == 'IPaddress': + ips.append(host_node.text) + except: + pass + return ips + # ============================================================================== # Usage example: @@ -363,18 +386,41 @@ def load(self, *args, **kwargs): if self.srcmd.cmd in ( 'vdi_attach_from_config', 'vdi_detach_from_config' ): - # We must have a valid LINSTOR instance here without using - # the XAPI. + def create_linstor(uri, attempt_count=30): + self._linstor = LinstorVolumeManager( + uri, + self._group_name, + logger=util.SMlog, + attempt_count=attempt_count + ) + controller_uri = get_controller_uri() + if controller_uri: + create_linstor(controller_uri) + else: + def connect(): + # We must have a valid LINSTOR instance here without using + # the XAPI. Fallback with the HA config file. + for ip in get_ips_from_xha_config_file(): + controller_uri = 'linstor://' + ip + try: + util.SMlog('Connecting from config to LINSTOR controller using: {}'.format(ip)) + create_linstor(controller_uri, attempt_count=0) + return controller_uri + except: + pass + + controller_uri = util.retry(connect, maxretry=30, period=1) + if not controller_uri: + raise xs_errors.XenError( + 'SRUnavailable', + opterr='No valid controller URI to attach/detach from config' + ) + self._journaler = LinstorJournaler( controller_uri, self._group_name, logger=util.SMlog ) - self._linstor = LinstorVolumeManager( - controller_uri, - self._group_name, - logger=util.SMlog - ) return wrapped_method(self, *args, **kwargs) if not self._is_master: diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index da98e0b6..b4ee7830 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -186,15 +186,16 @@ def _get_controller_uri(): # primary using drbdadm because we don't have all 
connections to the # replicated volume. `drbdadm status xcp-persistent-database` returns # 3 connections by default. - session = util.get_localAPI_session() - for host_ref, host_record in session.xenapi.host.get_all_records().items(): - if distutils.util.strtobool( - session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) - ): - return 'linstor://' + host_record['hostname'] - - # Not found, maybe we are trying to create the SR... - + try: + session = util.get_localAPI_session() + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + if distutils.util.strtobool( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) + ): + return 'linstor://' + host_record['hostname'] + except: + # Not found, maybe we are trying to create the SR... + pass def get_controller_uri(): retries = 0 @@ -349,7 +350,8 @@ def __repr__(self): # -------------------------------------------------------------------------- def __init__( - self, uri, group_name, repair=False, logger=default_logger.__func__ + self, uri, group_name, repair=False, logger=default_logger.__func__, + attempt_count=30 ): """ Create a new LinstorVolumeManager object. @@ -358,9 +360,12 @@ def __init__( :param bool repair: If true we try to remove bad volumes due to a crash or unexpected behavior. :param function logger: Function to log messages. + :param int attempt_count: Number of attempts to join the controller. """ - self._linstor = self._create_linstor_instance(uri) + self._linstor = self._create_linstor_instance( + uri, attempt_count=attempt_count + ) self._base_group_name = group_name # Ensure group exists. 
@@ -2169,7 +2174,9 @@ def _get_error_str(cls, result): ]) @classmethod - def _create_linstor_instance(cls, uri, keep_uri_unmodified=False): + def _create_linstor_instance( + cls, uri, keep_uri_unmodified=False, attempt_count=30 + ): retry = False def connect(uri): @@ -2193,7 +2200,8 @@ def connect(uri): return util.retry( lambda: connect(uri), - maxretry=10, + maxretry=attempt_count, + period=1, exceptions=[ linstor.errors.LinstorNetworkError, LinstorVolumeManagerError From 08f3be9bcbaf6233fc1f1aef14e09e48136f5451 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 24 Mar 2022 18:13:46 +0100 Subject: [PATCH 047/133] fix(LinstorSR): use IPs instead of hostnames in NBD server Without this patch we can't use XCP-ng hosts configured with static IPS. Only servers with DHCP enabled can access the HTTP server(s) otherwise. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 101 ++++++++++++++++++++++++++++++++++++------- drivers/util.py | 12 +++++ 2 files changed, 97 insertions(+), 16 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index a4f7afce..c0bfc3f1 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -251,8 +251,15 @@ def deflate(linstor, vdi_uuid, vdi_path, new_size, old_size): # TODO: Change the LINSTOR volume size using linstor.resize_volume. +IPS_XHA_CACHE = None + + def get_ips_from_xha_config_file(): - ips = [] + if IPS_XHA_CACHE: + return IPS_XHA_CACHE + + ips = dict() + host_ip = None try: # Ensure there is no dirty read problem. # For example if the HA is reloaded. 
@@ -261,15 +268,50 @@ def get_ips_from_xha_config_file(): maxretry=10, period=1 ) - - for node in tree.getroot()[0]: - if node.tag == 'host': - for host_node in node: - if host_node.tag == 'IPaddress': - ips.append(host_node.text) except: - pass - return ips + return (host_ip, ips) + + def parse_host_nodes(ips, node): + current_id = None + current_ip = None + + for sub_node in node: + if sub_node.tag == 'IPaddress': + current_ip = sub_node.text + elif sub_node.tag == 'HostID': + current_id = sub_node.text + else: + continue + + if current_id and current_ip: + ips[current_id] = current_ip + return + util.SMlog('Ill-formed XHA file, missing IPaddress or/and HostID') + + def parse_common_config(ips, node): + for sub_node in node: + if sub_node.tag == 'host': + parse_host_nodes(ips, sub_node) + + def parse_local_config(ips, node): + for sub_node in node: + if sub_node.tag == 'localhost': + for host_node in sub_node: + if host_node.tag == 'HostID': + return host_node.text + + for node in tree.getroot(): + if node.tag == 'common-config': + parse_common_config(ips, node) + elif node.tag == 'local-config': + host_ip = parse_local_config(ips, node) + else: + continue + + if ips and host_ip: + break + + return (host_ip, ips) # ============================================================================== @@ -401,7 +443,7 @@ def create_linstor(uri, attempt_count=30): def connect(): # We must have a valid LINSTOR instance here without using # the XAPI. Fallback with the HA config file. - for ip in get_ips_from_xha_config_file(): + for ip in get_ips_from_xha_config_file()[1].values(): controller_uri = 'linstor://' + ip try: util.SMlog('Connecting from config to LINSTOR controller using: {}'.format(ip)) @@ -2402,10 +2444,23 @@ def _start_persistent_http_server(volume_name): else: port = '8077' + try: + session = util.get_localAPI_session() + host_ip = util.get_this_host_address(session) + except: + # Fallback using the XHA file if session not available. 
+ host_ip, _ = get_ips_from_xha_config_file() + if not host_ip: + raise Exception( + 'Cannot start persistent HTTP server: no XAPI session, nor XHA config file' + ) + arguments = [ 'http-disk-server', '--disk', '/dev/drbd/by-res/{}/0'.format(volume_name), + '--ip', + host_ip, '--port', port ] @@ -2432,7 +2487,10 @@ def _start_persistent_http_server(volume_name): if http_server: # Kill process and children in this case... - os.killpg(os.getpgid(http_server.pid), signal.SIGTERM) + try: + os.killpg(os.getpgid(http_server.pid), signal.SIGTERM) + except: + pass raise xs_errors.XenError( 'VDIUnavailable', @@ -2456,6 +2514,17 @@ def _start_persistent_nbd_server(self, volume_name): else: port = '8077' + try: + session = util.get_localAPI_session() + ips = util.get_host_addresses(session) + except Exception as e: + _, ips = get_ips_from_xha_config_file() + if not ips: + raise Exception( + 'Cannot start persistent NBD server: no XAPI session, nor XHA config file ({})'.format(e) + ) + ips = ips.values() + arguments = [ 'nbd-http-server', '--socket-path', @@ -2463,10 +2532,7 @@ def _start_persistent_nbd_server(self, volume_name): '--nbd-name', volume_name, '--urls', - ','.join(map( - lambda host: "http://" + host + ':' + port, - self.sr._hosts - )) + ','.join(map(lambda ip: 'http://' + ip + ':' + port, ips)) ] util.SMlog('Starting {} using port {}...'.format(arguments[0], port)) @@ -2515,7 +2581,10 @@ def get_nbd_path(): if nbd_server: # Kill process and children in this case... 
- os.killpg(os.getpgid(nbd_server.pid), signal.SIGTERM) + try: + os.killpg(os.getpgid(nbd_server.pid), signal.SIGTERM) + except: + pass raise xs_errors.XenError( 'VDIUnavailable', diff --git a/drivers/util.py b/drivers/util.py index 50b62858..484c5f70 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -700,6 +700,18 @@ def get_hosts_attached_on(session, vdi_uuids): host_refs[key[len('host_'):]] = True return host_refs.keys() +def get_this_host_address(session): + host_uuid = get_this_host() + host_ref = session.xenapi.host.get_by_uuid(host_uuid) + return session.xenapi.host.get_record(host_ref)['address'] + +def get_host_addresses(session): + addresses = [] + hosts = session.xenapi.host.get_all_records() + for record in hosts.itervalues(): + addresses.append(record['address']) + return addresses + def get_this_host_ref(session): host_uuid = get_this_host() host_ref = session.xenapi.host.get_by_uuid(host_uuid) From 08805e4c792e8e65c6a8137cdaf7d31bb1c5d014 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 31 Mar 2022 11:21:19 +0200 Subject: [PATCH 048/133] fix(LinstorVolumeManager): ensure we always use IPs in _get_controller_uri Otherwise if a hostname is returned, we can't use it if the XCP-ng pool is configured using static IPs instead of DHCP. Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index b4ee7830..2d5c63ed 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -192,7 +192,7 @@ def _get_controller_uri(): if distutils.util.strtobool( session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) ): - return 'linstor://' + host_record['hostname'] + return 'linstor://' + host_record['address'] except: # Not found, maybe we are trying to create the SR... 
pass From 922aa7ddc63a2d825b42cce41ef4cb8c2c356c67 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 6 Apr 2022 17:53:02 +0200 Subject: [PATCH 049/133] feat(linstor-manager): add methods to add/remove host from LINSTOR SR Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 6 +- drivers/linstor-manager | 283 +++++++++++++++++++++++++++++++- drivers/linstorvolumemanager.py | 54 ++++++ 3 files changed, 338 insertions(+), 5 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index c0bfc3f1..413c5501 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -401,7 +401,9 @@ def load(self, sr_uuid): self._ips = None else: self._ips = self.dconf['ips'].split(',') - self._redundancy = int(self.dconf['redundancy'] or 1) + + if self.cmd == 'sr_create': + self._redundancy = int(self.dconf['redundancy']) or 1 self._linstor = None # Ensure that LINSTOR attribute exists. self._journaler = None @@ -1004,7 +1006,7 @@ def _update_physical_size(self): # ensures the displayed physical size is reachable by the user. 
self.physical_size = \ self._linstor.min_physical_size * len(self._hosts) / \ - self._redundancy + self._linstor.redundancy self.physical_utilisation = self._linstor.allocated_volume_size diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 30230adb..7e34ce65 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -22,8 +22,7 @@ sys.path[0] = '/opt/xensource/sm/' import base64 import distutils.util -import time -import subprocess +import socket import XenAPIPlugin from linstorjournaler import LinstorJournaler @@ -72,6 +71,9 @@ def update_minidrbdcluster_service(start): util.enable_and_start_service('minidrbdcluster', start) +# ------------------------------------------------------------------------------ + + def prepare_sr(session, args): try: update_all_ports(open=True) @@ -338,6 +340,279 @@ def has_controller_running(session, args): return str(ret == 0) +def add_host(session, args): + group_name = args['groupName'] + + # 1. Find SR and PBDs. + srs = dict() + for sr_ref, sr in session.xenapi.SR.get_all_records().items(): + if sr.get('type') == 'linstor': + srs[sr_ref] = sr + + pbds = dict() + for pbd_ref, pbd in session.xenapi.PBD.get_all_records().items(): + device_config = pbd.get('device_config') + if ( + device_config and + device_config.get('group-name') == group_name + and pbd['SR'] in srs + ): + pbds[pbd_ref] = pbd + + # 2. Ensure there is at least one PBD and all PBDs are used in + # the same SR. + if not pbds: + raise Exception( + 'Failed to find PBDs of group `{}`'.format(group_name) + ) + + sr_ref = None + for pbd in pbds.values(): + if not sr_ref: + sr_ref = pbd['SR'] + elif pbd['SR'] != sr_ref: + raise Exception( + 'Group `{}` is used by many SRs!'.format(group_name) + ) + + # 3. Ensure node doesn't exist. 
+ linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + node_name = socket.gethostname() + has_node = linstor.has_node(node_name) + + pbd_id = 0 + new_pbd_ref = None + + try: + # 4. Enable services. + update_all_ports(open=True) + update_minidrbdcluster_service(start=True) + update_linstor_satellite_service(start=True) + + # 5. Try to create local node. + if not has_node: + linstor.create_node(node_name, util.get_this_host_address(session)) + + # 6. Recreate PBDs. + # Use the redundancy given by Linstor instead of smapi config. + redundancy = linstor.redundancy + default_device_config = None + this_host = util.get_this_host_ref(session) + create_new_pbd = True + + assert pbds + pbds = pbds.items() + for pbd_ref, pbd in pbds: + device_config = pbd['device_config'] + + hosts = filter( + lambda host: len(host.strip()), + device_config.get('hosts', []).split(',') + ) + hosts.append(node_name) + hosts = ','.join(list(set(hosts))) + + # Should be the same on all hosts. + provisioning = device_config['provisioning'] + + if not default_device_config: + default_device_config = { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': hosts, + 'provisioning': provisioning + } + + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + pbd_id += 1 + + host = pbd['host'] + if host == this_host: + create_new_pbd = False + + pbd_ref = session.xenapi.PBD.create({ + 'host': host, + 'SR': sr_ref, + 'device_config': { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': hosts, + 'provisioning': provisioning + } + }) + try: + session.xenapi.PBD.plug(pbd_ref) + except Exception as e: + util.SMlog('Failed to replug PBD: {}'.format(e)) + + # 7. Create new PBD. 
+ if create_new_pbd: + new_pbd_ref = session.xenapi.PBD.create({ + 'host': this_host, + 'SR': sr_ref, + 'device_config': default_device_config + }) + try: + session.xenapi.PBD.plug(new_pbd_ref) + except Exception as e: + util.SMlog('Failed to plug new PBD: {}'.format(e)) + + return str(True) + except Exception as e: + stop_services = not has_node + if stop_services: + try: + linstor.destroy_node(node_name) + except Exception: + pass + + for pbd_ref, pbd in pbds[:pbd_id]: + try: + session.xenapi.PBD.unplug(pbd_ref) + except Exception: + pass + + try: + session.xenapi.PBD.destroy(pbd_ref) + except Exception: + pass + + try: + device_config = pbd['device_config'] + session.xenapi.PBD.create({ + 'host': host, + 'SR': sr_ref, + 'device_config': { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': device_config['hosts'], + 'provisioning': device_config['provisioning'] + } + }) + except Exception as pbd_error: + util.SMlog('Failed to recreate PBD: {}'.format(pbd_error)) + pass + + try: + session.xenapi.PBD.plug(pbd_ref) + except Exception: + pass + + if new_pbd_ref: + try: + session.xenapi.PBD.unplug(new_pbd_ref) + except Exception: + pass + + try: + session.xenapi.PBD.destroy(new_pbd_ref) + except Exception: + pass + + try: + # If we failed to remove the node, we don't stop services. + if stop_services and not linstor.has_node(node_name): + update_linstor_satellite_service(start=False) + update_minidrbdcluster_service(start=False) + update_all_ports(open=False) + except Exception: + pass + + raise e + + +def remove_host(session, args): + group_name = args['groupName'] + force = args.get('force') or False + + # 1. Find SRs and PBDs. 
+ srs = dict() + for sr_ref, sr in session.xenapi.SR.get_all_records().items(): + if sr.get('type') == 'linstor': + srs[sr_ref] = sr + + pbds = dict() + for pbd_ref, pbd in session.xenapi.PBD.get_all_records().items(): + device_config = pbd.get('device_config') + if ( + device_config and + device_config.get('group-name') == group_name + and pbd['SR'] in srs + ): + pbds[pbd_ref] = pbd + + # 2. Remove node. + linstor = LinstorVolumeManager( + get_controller_uri(), + group_name, + logger=util.SMlog + ) + + node_name = socket.gethostname() + if linstor.has_node(node_name): + linstor.destroy_node(node_name) + if linstor.has_node(node_name): + raise Exception('Failed to remove node! Unknown error.') + + redundancy = linstor.redundancy + this_host = util.get_this_host_ref(session) + + # 3. Update PBDs. + for pbd_ref, pbd in pbds.items(): + host = pbd['host'] + if host == this_host: + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + continue + + device_config = pbd['device_config'] + hosts = device_config.get('hosts', []).split(',') + try: + hosts.remove(node_name) + except Exception as e: + continue + hosts = ','.join(list(set(hosts))) + + if pbd['currently_attached']: + session.xenapi.PBD.unplug(pbd_ref) + session.xenapi.PBD.destroy(pbd_ref) + + pbd_ref = session.xenapi.PBD.create({ + 'host': host, + 'SR': pbd['SR'], + 'device_config': { + 'group-name': group_name, + 'redundancy': redundancy, + 'hosts': hosts, + 'provisioning': device_config['provisioning'] + } + }) + + try: + session.xenapi.PBD.plug(pbd_ref) + except Exception as e: + util.SMlog('Failed to replug PBD: {}'.format(e)) + + # 3. Stop services. 
+ try: + update_linstor_satellite_service(start=False) + update_minidrbdcluster_service(start=False) + update_all_ports(open=False) + except Exception as e: + util.SMlog('Error while stopping services: {}'.format(e)) + pass + + return str('True') + + if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, @@ -357,5 +632,7 @@ if __name__ == '__main__': 'getBlockBitmap': get_block_bitmap, 'lockVdi': lock_vdi, 'lsofResource': lsof_resource, - 'hasControllerRunning': has_controller_running + 'hasControllerRunning': has_controller_running, + 'addHost': add_host, + 'removeHost': remove_host }) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 2d5c63ed..6c0d5aa2 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -402,6 +402,15 @@ def group_name(self): """ return self._base_group_name + @property + def redundancy(self): + """ + Give the used redundancy. + :return: The redundancy. + :rtype: int + """ + return self._redundancy + @property def volumes(self): """ @@ -1376,6 +1385,51 @@ def invalidate_resource_cache(self): """ self._mark_resource_cache_as_dirty() + def has_node(self, node_name): + """ + Check if a node exists in the LINSTOR database. + :rtype: bool + """ + result = self._linstor.node_list() + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Failed to list nodes using `{}`: {}' + .format(node_name, error_str) + ) + return bool(result[0].node(node_name)) + + def create_node(self, node_name, ip): + """ + Create a new node in the LINSTOR database. + :param str node_name: Node name to use. + :param str ip: Host IP to communicate. 
+ """ + result = self._linstor.node_create( + node_name, + linstor.consts.VAL_NODE_TYPE_CMBD, + ip + ) + errors = self._filter_errors(result) + if errors: + error_str = self._get_error_str(errors) + raise LinstorVolumeManagerError( + 'Failed to create node `{}`: {}'.format(node_name, error_str) + ) + + def destroy_node(self, node_name): + """ + Destroy a node in the LINSTOR database. + :param str node_name: Node name to remove. + """ + result = self._linstor.node_delete(node_name) + errors = self._filter_errors(result) + if errors: + error_str = self._get_error_str(errors) + raise LinstorVolumeManagerError( + 'Failed to destroy node `{}`: {}'.format(node_name, error_str) + ) + @classmethod def create_sr( cls, group_name, node_names, ips, redundancy, From 2c85b6a38c1674de14e4572158f29e34f994782d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 13 Apr 2022 15:56:42 +0200 Subject: [PATCH 050/133] feat(LinstorVolumeManager): support SR creation with diskless nodes Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 66 +++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 6c0d5aa2..430e080b 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1543,9 +1543,14 @@ def _create_sr( ) # 2. Create storage pool on each node + resource group. + reg_volume_group_not_found = re.compile( + ".*Volume group '.*' not found$" + ) + i = 0 try: # 2.a. Create storage pools. 
+ storage_pool_count = 0 while i < len(node_names): node_name = node_names[i] @@ -1556,17 +1561,35 @@ def _create_sr( driver_pool_name=driver_pool_name ) - error_str = cls._get_error_str(result) - if error_str: - raise LinstorVolumeManagerError( - 'Could not create SP `{}` on node `{}`: {}'.format( - group_name, - node_name, - error_str + errors = linstor.Linstor.filter_api_call_response_errors( + result + ) + if errors: + if len(errors) == 1 and errors[0].is_error( + linstor.consts.FAIL_STOR_POOL_CONFIGURATION_ERROR + ) and reg_volume_group_not_found.match(errors[0].message): + logger( + 'Volume group `{}` not found on `{}`. Ignoring...' + .format(group_name, node_name) ) - ) + cls._destroy_storage_pool(lin, group_name, node_name) + else: + error_str = cls._get_error_str(result) + raise LinstorVolumeManagerError( + 'Could not create SP `{}` on node `{}`: {}' + .format(group_name, node_name, error_str) + ) + else: + storage_pool_count += 1 i += 1 + if not storage_pool_count: + raise LinstorVolumeManagerError( + 'Unable to create SR `{}`: No VG group found'.format( + group_name, + ) + ) + # 2.b. Create resource group. result = lin.resource_group_create( name=group_name, @@ -2345,13 +2368,22 @@ def _create_database_volume( # "Not enough available nodes" # I don't understand why but this command protect against this bug. try: - lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) + pools = lin.storage_pool_list_raise( + filter_by_stor_pools=[group_name] + ) except Exception as e: raise LinstorVolumeManagerError( 'Failed to get storage pool list before database creation: {}' .format(e) ) + # Ensure we have a correct list of storage pools. + nodes_with_pool = map(lambda pool: pool.node_name, pools.storage_pools) + assert nodes_with_pool # We must have at least one storage pool! + for node_name in nodes_with_pool: + assert node_name in node_names + util.SMlog('Nodes with storage pool: {}'.format(nodes_with_pool)) + # Create the database definition. 
size = cls.round_up_volume_size(DATABASE_SIZE) cls._check_volume_creation_errors(lin.resource_group_spawn( @@ -2364,14 +2396,26 @@ def _create_database_volume( # Create real resources on the first nodes. resources = [] - for node_name in node_names[:redundancy]: + + diskful_nodes = [] + diskless_nodes = [] + for node_name in node_names: + if node_name in nodes_with_pool: + diskful_nodes.append(node_name) + else: + diskless_nodes.append(node_name) + + assert diskful_nodes + for node_name in diskful_nodes[:redundancy]: + util.SMlog('Create database diskful on {}'.format(node_name)) resources.append(linstor.ResourceData( node_name=node_name, rsc_name=DATABASE_VOLUME_NAME, storage_pool=group_name )) # Create diskless resources on the remaining set. - for node_name in node_names[redundancy:]: + for node_name in diskful_nodes[redundancy:] + diskless_nodes: + util.SMlog('Create database diskless on {}'.format(node_name)) resources.append(linstor.ResourceData( node_name=node_name, rsc_name=DATABASE_VOLUME_NAME, From 0f508f8063fb5b8e3c02f3751fbef06acc8ed049 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 14 Apr 2022 10:30:23 +0200 Subject: [PATCH 051/133] feat(LinstorSR): add a config var to disable HTTP/NBD servers Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 413c5501..927e4771 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -61,6 +61,13 @@ FORK_LOG_DAEMON = '/opt/xensource/libexec/fork-log-daemon' +# This flag can be disabled to debug the DRBD layer. +# When this config var is False, the HA can only be used under +# specific conditions: +# - Only one heartbeat diskless VDI is present in the pool. +# - The other hearbeat volumes must be diskful and limited to a maximum of 3. 
+USE_HTTP_NBD_SERVERS = True + # ============================================================================== # TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', @@ -1769,7 +1776,11 @@ def attach(self, sr_uuid, vdi_uuid): self.xenstore_data = {} self.xenstore_data['storage-type'] = LinstorSR.DRIVER_TYPE - if attach_from_config and self.path.startswith('/dev/http-nbd/'): + if ( + USE_HTTP_NBD_SERVERS and + attach_from_config and + self.path.startswith('/dev/http-nbd/') + ): return self._attach_using_http_nbd() if not util.pathexists(self.path): @@ -1934,7 +1945,7 @@ def generate_config(self, sr_uuid, vdi_uuid): # We can't increase this limitation, so we use a NBD/HTTP device # instead. volume_name = self._linstor.get_volume_name(self.uuid) - if volume_name not in [ + if not USE_HTTP_NBD_SERVERS or volume_name not in [ 'xcp-persistent-ha-statefile', 'xcp-persistent-redo-log' ]: if not self.path or not util.pathexists(self.path): From f7e8113c16147161931533cbe36d9dc86bb0677e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 14 Apr 2022 15:45:20 +0200 Subject: [PATCH 052/133] feat(LinstorSr): ensure LVM group is activated during SR.attach/create Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 23 +++++++++++++++++------ drivers/linstor-manager | 2 ++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 927e4771..e2d3d783 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -34,6 +34,7 @@ import distutils import errno import functools +import lvutil import os import re import scsiutil @@ -320,6 +321,15 @@ def parse_local_config(ips, node): return (host_ip, ips) + +def activate_lvm_group(group_name): + path = group_name.split('/') + assert path and len(path) <= 2 + try: + lvutil.setActiveVG(path[0], True) + except Exception as e: + util.SMlog('Cannot active VG `{}`: {}'.format(path[0], e)) + # ============================================================================== 
# Usage example: @@ -673,7 +683,7 @@ def create(self, uuid, size): # Ensure ports are opened and LINSTOR satellites # are activated. In the same time the minidrbdcluster instances # must be stopped. - self._prepare_sr_on_all_hosts(enabled=True) + self._prepare_sr_on_all_hosts(self._group_name, enabled=True) # Create SR. # Throw if the SR already exists. @@ -798,6 +808,7 @@ def attach(self, uuid): 'SRUnavailable', opterr='no such group: {}'.format(self._group_name) ) + activate_lvm_group(self._group_name) @_locked_load def detach(self, uuid): @@ -907,20 +918,20 @@ def _exec_manager_command(self, host_ref, command, args, error): opterr='Plugin {} failed'.format(self.MANAGER_PLUGIN) ) - def _prepare_sr(self, host, enabled): + def _prepare_sr(self, host, group_name, enabled): self._exec_manager_command( host, 'prepareSr' if enabled else 'releaseSr', - {}, + {'groupName': group_name}, 'SRUnavailable' ) - def _prepare_sr_on_all_hosts(self, enabled): + def _prepare_sr_on_all_hosts(self, group_name, enabled): master = util.get_master_ref(self.session) - self._prepare_sr(master, enabled) + self._prepare_sr(master, group_name, enabled) for slave in util.get_all_slaves(self.session): - self._prepare_sr(slave, enabled) + self._prepare_sr(slave, group_name, enabled) def _update_minidrbdcluster(self, host, enabled): self._exec_manager_command( diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 7e34ce65..91731b1d 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -76,6 +76,8 @@ def update_minidrbdcluster_service(start): def prepare_sr(session, args): try: + LinstorSR.activate_lvm_group(args['groupName']) + update_all_ports(open=True) # We don't want to enable and start minidrbdcluster daemon during # SR creation. 
From a4e217356f383430b0bcc116718871f647f852c5 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 25 Apr 2022 14:47:51 +0200 Subject: [PATCH 053/133] feat(linstor-manager): add method to create LinstorSR + to list/destroy DRBD volumes Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 179 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 168 insertions(+), 11 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 91731b1d..7893ebc6 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -23,6 +23,7 @@ sys.path[0] = '/opt/xensource/sm/' import base64 import distutils.util import socket +import XenAPI import XenAPIPlugin from linstorjournaler import LinstorJournaler @@ -30,9 +31,13 @@ from linstorvolumemanager import get_controller_uri, LinstorVolumeManager from lock import Lock import json import LinstorSR +import re import util import vhdutil +BACKING_DISK_RE = re.compile('^/dev/([^/]+)/(?:[^/]+)$') +LVM_PLUGIN = 'lvm' +THIN_POOL = 'thin_pool' FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000', 8076, 8077] @@ -55,14 +60,6 @@ def update_all_ports(open): update_port(port, open) -def stop_service(name): - args = ('systemctl', 'stop', name) - (ret, out, err) = util.doexec(args) - if ret == 0: - return - raise Exception('Failed to stop {}: {} {}'.format(name, out, err)) - - def update_linstor_satellite_service(start): util.enable_and_start_service('linstor-satellite', start) @@ -71,6 +68,111 @@ def update_minidrbdcluster_service(start): util.enable_and_start_service('minidrbdcluster', start) +def exec_create_sr(session, name, description, disks, volume_group, redundancy, thin, force): + disks = json.loads(disks) + disk_hostnames = disks.keys() + + # Create volumes. 
+ hosts = session.xenapi.host.get_all_records() + hostnames = [] + for host_ref, host_record in hosts.items(): + hostname = host_record['hostname'] + if hostname not in disk_hostnames: + continue + + if force: + try: + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'destroy_volume_group', { + 'vg_name': volume_group, + 'force': 'True' + } + ) + except Exception as e: + try: + response = session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'list_volume_groups', { + 'vg_name': volume_group + } + ) + if response != '{}': + raise e + except Exception: + raise e + + host_devices = ','.join(disks[hostname]) + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_physical_volume', { + 'devices': host_devices, + 'force': str(force) + } + ) + + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_volume_group', { + 'vg_name': volume_group, + 'devices': host_devices + } + ) + + if thin: + session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'create_thin_pool', { + 'vg_name': volume_group, + 'lv_name': THIN_POOL + } + ) + + # Create SR. 
+ master_ref = session.xenapi.pool.get_all_records().values()[0]['master'] + + device_config = { + 'redundancy': redundancy, + 'provisioning': 'thin' if thin else 'thick', + 'group-name': '{}/{}'.format(volume_group, THIN_POOL) if thin else volume_group, + 'hosts': ','.join(hostnames), + 'monitor-db-quorum': str(len(hostnames) > 2) + } + sr_ref = session.xenapi.SR.create( + master_ref, device_config, '0', name, description, 'linstor', '', True, {} + ) + return session.xenapi.SR.get_uuid(sr_ref) + + +def get_drbd_volumes(volume_group=None): + drbd_volumes = {} + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'show', '--json']) + if ret: + raise Exception('Failed to get JSON object: {}'.format(stderr)) + + config = json.loads(stdout) + for resource in config: + for volume in resource['_this_host']['volumes']: + backing_disk = volume['backing-disk'] + match = BACKING_DISK_RE.match(backing_disk) + if not match: + continue + + cur_volume_group = match.groups()[0] + if volume_group and cur_volume_group != volume_group: + continue + + minor = int(volume['device_minor']) + if cur_volume_group in drbd_volumes: + drbd_volumes[cur_volume_group].append(minor) + else: + drbd_volumes[cur_volume_group] = [minor] + return drbd_volumes + + +def force_destroy_drbd_volume(minor): + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'detach', minor, '--force']) + if ret: + raise Exception('Failed to detach volume: {}'.format(stderr)) + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'del-minor', minor]) + if ret: + raise Exception('Failed to destroy volume: {}'.format(stderr)) + # ------------------------------------------------------------------------------ @@ -169,8 +271,8 @@ def destroy(session, args): linstor.destroy() return str(True) except Exception as e: - stop_service('linstor-controller') - stop_service('var-lib-linstor.service') + util.stop_service('linstor-controller') + util.stop_service('var-lib-linstor.service') util.SMlog('linstor-manager:destroy error: 
{}'.format(e)) return str(False) @@ -615,6 +717,57 @@ def remove_host(session, args): return str('True') +def create_sr(session, args): + try: + name = args['name'] + description = args.get('description') or '' + disks = json.loads(args['disks']) + volume_group = args['volume_group'] + redundancy = int(args['redundancy']) + thin = distutils.util.strtobool(args.get('thin') or '0') + force = distutils.util.strtobool(args.get('force') or '0') + return json.dumps(exec_create_sr( + session, name, description, disks, volume_group, redundancy, thin, force + )) + except Exception as e: + util.SMlog('linstor-manager:create_sr error: {}'.format(e)) + raise + + +def list_drbd_volumes(session, args): + try: + volume_group = args.get('volume_group') + return json.dumps(get_drbd_volumes(volume_group)) + except Exception as e: + util.SMlog('linstor-manager:list_drbd_volumes error: {}'.format(e)) + raise + + +def destroy_drbd_volume(session, args): + try: + minor = args.get('minor') + if not minor: + raise Exception('Cannot destroy DRBD volume without minor.') + force_destroy_drbd_volume(minor) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:destroy_drbd_volume error: {}'.format(e)) + return str(False) + + +def destroy_drbd_volumes(session, args): + try: + volume_group = args.get('volume_group') + if not volume_group: + raise Exception('Cannot destroy DRBD volumes without volume group.') + for minor in get_drbd_volumes(volume_group).get(volume_group, []): + force_destroy_drbd_volume(str(minor)) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:destroy_drbd_volumes error: {}'.format(e)) + return str(False) + + if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, @@ -636,5 +789,9 @@ if __name__ == '__main__': 'lsofResource': lsof_resource, 'hasControllerRunning': has_controller_running, 'addHost': add_host, - 'removeHost': remove_host + 'removeHost': remove_host, + 'createSr': create_sr, + 'listDrbdVolumes': 
list_drbd_volumes, + 'destroyDrbdVolume': destroy_drbd_volume, + 'destroyDrbdVolumes': destroy_drbd_volumes }) From 52fefb7e4ec2d27a46afbc60f67edc1fcb94f267 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 26 Apr 2022 11:20:08 +0200 Subject: [PATCH 054/133] fix(LinstorSR): always set vdi_path in generate_config If the volume of a generated config is not related to HTTP/NBD and if we already have a path to the resource, the VDI path is never written to the config. So the config can't be used to attach the VDI... Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index e2d3d783..1855e3d9 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1970,7 +1970,7 @@ def generate_config(self, sr_uuid, vdi_uuid): if not available: raise xs_errors.XenError('VDIUnavailable') - resp['vdi_path'] = self.path + resp['vdi_path'] = self.path else: # Axiom: DRBD device is present on at least one host. 
resp['vdi_path'] = '/dev/http-nbd/' + volume_name From 2a85b1e3132c9155c5d9350964a94bcfdf31b82b Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 13 May 2022 14:35:57 +0200 Subject: [PATCH 055/133] fix(minidrbdcluster): supports new properties like `force-io-failures` Signed-off-by: Ronan Abhamon --- scripts/minidrbdcluster | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/minidrbdcluster b/scripts/minidrbdcluster index a04b6c1c..fb4de09b 100755 --- a/scripts/minidrbdcluster +++ b/scripts/minidrbdcluster @@ -11,15 +11,15 @@ DRBDADM_OPEN_FAILED_RE = re.compile( ) MAY_PROMOT_RE = re.compile( '(?:exists|change) resource name:((?:\\w|-)+) ' - '(?:\\w+\\:\\w+ )*may_promote:(yes|no) promotion_score:(\\d+)' + '(?:(?:\\w|-)+\\:(?:\\w|-)+ )*may_promote:(yes|no) promotion_score:(\\d+)' ) PEER_ROLE_RE = re.compile( '(?:exists|change) connection name:((?:\\w|-)+) peer-node-id:(?:\\d+) ' - 'conn-name:(\\w+) (?:\\w+\\:\\w+ )*role:(Primary|Secondary|Unknown)' + 'conn-name:((?:\\w|-)+) (?:(?:\\w|-)+\\:(?:\\w|-)+ )*role:(Primary|Secondary|Unknown)' ) HAVE_QUORUM_RE = re.compile( '(?:exists|change) device name:((?:\\w|-)+) ' - '(?:\\w+\\:\\w+ )*quorum:(yes|no)' + '(?:(?:\\w|-)+\\:(?:\\w|-)+ )*quorum:(yes|no)' ) From 23d60c6a991a95c0db87b58caac6fc5814f4d20f Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 18 May 2022 17:28:33 +0200 Subject: [PATCH 056/133] fix(LinstorSR): enable/disable minidrbdcluster with fixed order Ensure we disable minidrbdcluster during SR destruction on all hosts with a deterministic execution to ensure linstor-controller is never restarted on another host. It was possible before this patch because the host of the minidrbdcluster that was running the controller could be stopped before the others. Now the primary service is stopped last.
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 1855e3d9..57280e36 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -751,7 +751,9 @@ def delete(self, uuid): ) try: - self._update_minidrbdcluster_on_all_hosts(enabled=False) + self._update_minidrbdcluster_on_all_hosts( + controller_node_name=node_name, enabled=False + ) args = { 'groupName': self._group_name, @@ -761,7 +763,9 @@ def delete(self, uuid): ) except Exception as e: try: - self._update_minidrbdcluster_on_all_hosts(enabled=True) + self._update_minidrbdcluster_on_all_hosts( + controller_node_name=node_name, enabled=True + ) except Exception as e2: util.SMlog( 'Failed to restart minidrbdcluster after destroy fail: {}' @@ -941,12 +945,32 @@ def _update_minidrbdcluster(self, host, enabled): 'SRUnavailable' ) - def _update_minidrbdcluster_on_all_hosts(self, enabled): - master = util.get_master_ref(self.session) - self._update_minidrbdcluster(master, enabled) + def _update_minidrbdcluster_on_all_hosts( + self, enabled, controller_node_name=None + ): + controller_host = None + secondary_hosts = [] - for slave in util.get_all_slaves(self.session): - self._update_minidrbdcluster(slave, enabled) + hosts = self.session.xenapi.host.get_all_records() + for host_ref, host_rec in hosts.iteritems(): + if controller_node_name == host_rec['hostname']: + controller_host = host_ref + else: + secondary_hosts.append(host_ref) + + if enabled and controller_host: + # If enabled is true, we try to start the controller on the desired + # node name first. + self._update_minidrbdcluster(controller_host, enabled) + + for host in secondary_hosts: + self._update_minidrbdcluster(host, enabled) + + if not enabled and controller_host: + # If enabled is false, we disable the minidrbdcluster service of + # the controller host last. Why? 
Otherwise the linstor-controller + # of other nodes can be started, and we don't want that. + self._update_minidrbdcluster(controller_host, enabled) # -------------------------------------------------------------------------- # Metadata. From 2d69d74a50da3030ff5dfa99d81499685dad6070 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 31 May 2022 14:01:45 +0200 Subject: [PATCH 057/133] fix(linstor-manager): change linstor satellite start behavior Ensure we don't have an invalid cache used by a satellite: - We found an issue with a new added disk which used a volume group name formerly involved by another disk. To avoid this kind of problem, we always restart the satellite. Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 7893ebc6..c6d622f2 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -61,7 +61,16 @@ def update_all_ports(open): def update_linstor_satellite_service(start): - util.enable_and_start_service('linstor-satellite', start) + service = 'linstor-satellite' + + # Stop services in all cases first. + # Ensure we don't have an invalid cache used by a satellite. + # (We found an issue with a new added disk which used a volume group name + # formerly involved by another disk. To avoid this kind of problem, we + # always restart the satellite.) + util.enable_and_start_service(service, False) + if start: + util.enable_and_start_service(service, True) def update_minidrbdcluster_service(start): From d831cac842c6410020203422aaeb803912877f01 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 2 Jun 2022 09:04:28 +0200 Subject: [PATCH 058/133] Fix is_open call for LinstorSR 1. Ensure LinstorSR driver is imported in `_is_open` definition to register it in the driver list. Otherwise this function always fails with a SRUnknownType exception. 2. Fetch the dconf of the target SR to retrieve VDI path, i.e. 
we can't use fake params like other drivers, we must have a real LINSTOR connection to read in the DB the volume location. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 10 +++++++++- drivers/on_slave.py | 17 ++++++++++++++++- tests/test_on_slave.py | 10 +++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 57280e36..e5f6f85c 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -445,7 +445,11 @@ def wrapped_method(self, *args, **kwargs): def load(self, *args, **kwargs): if not self._has_session: if self.srcmd.cmd in ( - 'vdi_attach_from_config', 'vdi_detach_from_config' + 'vdi_attach_from_config', + 'vdi_detach_from_config', + # When on-slave (is_open) is executed we have an + # empty command. + None ): def create_linstor(uri, attempt_count=30): self._linstor = LinstorVolumeManager( @@ -482,6 +486,10 @@ def connect(): controller_uri, self._group_name, logger=util.SMlog ) + if self.srcmd.cmd is None: + # Only useful on on-slave plugin (is_open). + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + return wrapped_method(self, *args, **kwargs) if not self._is_master: diff --git a/drivers/on_slave.py b/drivers/on_slave.py index 534e6c90..0633cffa 100755 --- a/drivers/on_slave.py +++ b/drivers/on_slave.py @@ -77,6 +77,7 @@ def _is_open(session, args): import CephFSSR import EXTSR import GlusterFSSR + import LinstorSR import LVHDSR import MooseFSSR import NFSSR @@ -107,8 +108,22 @@ def _is_open(session, args): } cmd.params = {"command": None} + sr_uuid = srRec["uuid"] + + # Another ugly piece of code to load a real Linstor SR, otherwise + # we can't fetch the VDI path. 
+ if srType == 'linstor': + host_ref = util.get_this_host_ref(session) + sr_ref = session.xenapi.SR.get_by_uuid(sr_uuid) + + pbd = util.find_my_pbd(session, host_ref, sr_ref) + if pbd is None: + raise util.SMException('Failed to find Linstor PBD') + + cmd.dconf = session.xenapi.PBD.get_device_config(pbd) + driver = SR.driver(srType) - sr = driver(cmd, srRec["uuid"]) + sr = driver(cmd, sr_uuid) vdi = sr.vdi(vdiUuid) tapdisk = blktap2.Tapdisk.find_by_path(vdi.path) util.SMlog("Tapdisk for %s: %s" % (vdi.path, tapdisk)) diff --git a/tests/test_on_slave.py b/tests/test_on_slave.py index 54ebcd38..4c12d903 100644 --- a/tests/test_on_slave.py +++ b/tests/test_on_slave.py @@ -13,7 +13,15 @@ class Test_on_slave_is_open(unittest.TestCase): - MOCK_IMPORTS = ['SRCommand', 'SR', 'NFSSR', 'EXTSR', 'LVHDSR', 'blktap2'] + MOCK_IMPORTS = [ + 'SRCommand', + 'SR', + 'NFSSR', + 'EXTSR', + 'LVHDSR', + 'LinstorSR', + 'blktap2' + ] def fake_import(self, name, *args): print 'Asked to import {}'.format(name) From 199baaa6e6910b8196bac3a8b1db5c6dd1fbc2d1 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 2 Jun 2022 09:28:32 +0200 Subject: [PATCH 059/133] fix(linstorvhdutil): fix boolean params of `check` call `ignoreMissingFooter` and `fast` must be string types to be used with XAPI plugin API. 
Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 6 ++++-- drivers/linstorvhdutil.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index c6d622f2..63c0e3ed 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -289,8 +289,10 @@ def destroy(session, args): def check(session, args): try: device_path = args['devicePath'] - ignore_missing_footer = args['ignoreMissingFooter'] - fast = args['fast'] + ignore_missing_footer = distutils.util.strtobool( + args['ignoreMissingFooter'] + ) + fast = distutils.util.strtobool(args['fast']) return str(vhdutil.check(device_path, ignore_missing_footer, fast)) except Exception as e: util.SMlog('linstor-manager:check error: {}'.format(e)) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index 9ba0ac3b..f3d98705 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -111,7 +111,10 @@ def __init__(self, session, linstor): # -------------------------------------------------------------------------- def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): - kwargs = {'ignoreMissingFooter': ignore_missing_footer, 'fast': fast} + kwargs = { + 'ignoreMissingFooter': str(ignore_missing_footer), + 'fast': str(fast) + } return self._check(vdi_uuid, **kwargs) @linstorhostcall(vhdutil.check, 'check') From 853d85ccb6fbfe7a0291cc92a41f7e00deafeeb6 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 30 Jun 2022 17:09:51 +0200 Subject: [PATCH 060/133] feat(linstor-manager): robustify exec_create_sr - Use lvm.py XCP-ng xapi plugins instead of lvm (old name) - Check arguments to create the SR - Fix param types given to SR.create - lsof_resource use verbose output if there is a lock or problem - Remove useless `force` param on remove_host Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 112 ++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 34 deletions(-) diff --git 
a/drivers/linstor-manager b/drivers/linstor-manager index 63c0e3ed..2930a9ed 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -36,7 +36,7 @@ import util import vhdutil BACKING_DISK_RE = re.compile('^/dev/([^/]+)/(?:[^/]+)$') -LVM_PLUGIN = 'lvm' +LVM_PLUGIN = 'lvm.py' THIN_POOL = 'thin_pool' FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' @@ -77,17 +77,16 @@ def update_minidrbdcluster_service(start): util.enable_and_start_service('minidrbdcluster', start) -def exec_create_sr(session, name, description, disks, volume_group, redundancy, thin, force): - disks = json.loads(disks) +def exec_create_sr(session, name, description, disks, volume_group, redundancy, provisioning, force): disk_hostnames = disks.keys() + thin = provisioning == 'thin' # Create volumes. hosts = session.xenapi.host.get_all_records() hostnames = [] for host_ref, host_record in hosts.items(): hostname = host_record['hostname'] - if hostname not in disk_hostnames: - continue + hostnames.append(hostname) if force: try: @@ -109,10 +108,24 @@ def exec_create_sr(session, name, description, disks, volume_group, redundancy, except Exception: raise e - host_devices = ','.join(disks[hostname]) + if hostname not in disk_hostnames or not disks[hostname]: + if force or session.xenapi.host.call_plugin( + host_ref, LVM_PLUGIN, 'list_volume_groups', { + 'vg_name': volume_group + } + ) == '{}': + continue + raise Exception('Volume group should not exist on `{}`, you must remove it manually'.format(hostname)) + + host_disks = disks[hostname] + if type(host_disks) is list: + host_disks = ','.join(disks[hostname]) + else: + raise Exception('Disk value of `{}` must be a disk list'.format(hostname)) + session.xenapi.host.call_plugin( host_ref, LVM_PLUGIN, 'create_physical_volume', { - 'devices': host_devices, + 'devices': host_disks, 'force': str(force) } ) @@ -120,7 +133,7 @@ def exec_create_sr(session, name, description, disks, volume_group, redundancy, session.xenapi.host.call_plugin( 
host_ref, LVM_PLUGIN, 'create_volume_group', { 'vg_name': volume_group, - 'devices': host_devices + 'devices': host_disks } ) @@ -132,20 +145,20 @@ def exec_create_sr(session, name, description, disks, volume_group, redundancy, } ) - # Create SR. - master_ref = session.xenapi.pool.get_all_records().values()[0]['master'] - - device_config = { - 'redundancy': redundancy, - 'provisioning': 'thin' if thin else 'thick', - 'group-name': '{}/{}'.format(volume_group, THIN_POOL) if thin else volume_group, - 'hosts': ','.join(hostnames), - 'monitor-db-quorum': str(len(hostnames) > 2) - } - sr_ref = session.xenapi.SR.create( - master_ref, device_config, '0', name, description, 'linstor', '', True, {} - ) - return session.xenapi.SR.get_uuid(sr_ref) + # Create SR. + master_ref = session.xenapi.pool.get_all_records().values()[0]['master'] + + device_config = { + 'redundancy': str(redundancy), + 'provisioning': 'thin' if thin else 'thick', + 'group-name': '{}/{}'.format(volume_group, THIN_POOL) if thin else volume_group, + 'hosts': ','.join(hostnames), + 'monitor-db-quorum': str(len(hostnames) > 2) + } + sr_ref = session.xenapi.SR.create( + master_ref, device_config, '0', name, description, 'linstor', '', True, {} + ) + return session.xenapi.SR.get_uuid(sr_ref) def get_drbd_volumes(volume_group=None): @@ -435,13 +448,13 @@ def lock_vdi(session, args): def lsof_resource(session, args): try: drbd_path = args['drbdPath'] - (ret, stdout, stderr) = util.doexec(['lsof', drbd_path]) + (ret, stdout, stderr) = util.doexec(['lsof', '-V', drbd_path]) if ret == 0: return 'DRBD resource `{}` is open: {}'.format( - drbd_path, stdout + drbd_path, stdout.rstrip() ) return '`lsof` on DRBD resource `{}` returned {}: {}'.format( - drbd_path, ret, stderr + drbd_path, ret, stdout.rstrip() ) except Exception as e: util.SMlog('linstor-manager:lsof_drbd error: {}'.format(e)) @@ -645,7 +658,6 @@ def add_host(session, args): def remove_host(session, args): group_name = args['groupName'] - force = 
args.get('force') or False # 1. Find SRs and PBDs. srs = dict() @@ -730,16 +742,48 @@ def remove_host(session, args): def create_sr(session, args): try: - name = args['name'] + # Use a complex parsing contrary to the other functions because + # this helper is a public method and is not easy to use. + name = args.get('name') + if not name: + raise Exception('`name` is empty') + description = args.get('description') or '' - disks = json.loads(args['disks']) - volume_group = args['volume_group'] - redundancy = int(args['redundancy']) - thin = distutils.util.strtobool(args.get('thin') or '0') + + disks = args.get('disks') + if not disks: + raise Exception('`disks` is empty') + try: + disks = json.loads(disks) + except Exception as e: + raise Exception('failed to decode `disks`: {}'.format(e)) + if type(disks) is not dict: + raise Exception('`disks` must be a JSON object') + + volume_group = args.get('volume_group') + if not volume_group: + raise Exception('`volume_group` is empty') + + redundancy = args.get('redundancy') + if not redundancy: + raise Exception('`redundancy` is empty') + + try: + redundancy = int(redundancy) + except Exception: + raise Exception('`redundancy` is not a number') + + provisioning = args.get('provisioning') + if not provisioning: + provisioning = 'thin' + elif provisioning != 'thin' and provisioning != 'thick': + raise Exception('unsupported provisioning') + force = distutils.util.strtobool(args.get('force') or '0') - return json.dumps(exec_create_sr( - session, name, description, disks, volume_group, redundancy, thin, force - )) + + return exec_create_sr( + session, name, description, disks, volume_group, redundancy, provisioning, force + ) except Exception as e: util.SMlog('linstor-manager:create_sr error: {}'.format(e)) raise From 0582609fdf89a553e416577211fce941ada8749f Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 8 Jul 2022 14:52:25 +0200 Subject: [PATCH 061/133] fix(cleanup): print LINSTOR VDI UUID if error during info 
loading (not SR UUID) Signed-off-by: Ronan Abhamon --- drivers/cleanup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 0399389e..77b38c2b 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2969,7 +2969,7 @@ def _load_vdi_info(self): except Exception as e: Util.log( ' [VDI {}: failed to load VDI info]: {}' - .format(self.uuid, e) + .format(vdi_uuid, e) ) info = vhdutil.VHDInfo(vdi_uuid) info.error = 1 From 1c9105353ef6469e804c3f0f5dc32b715e12956e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 21 Jul 2022 11:39:20 +0200 Subject: [PATCH 062/133] feat(cleanup): raise and dump DRBD openers in case of bad coalesce Signed-off-by: Ronan Abhamon --- drivers/cleanup.py | 24 ++++++-------------- drivers/linstor-manager | 40 ++++++++++++++++++++++++++++++++- drivers/linstorvolumemanager.py | 24 ++++++++++++++++++++ 3 files changed, 70 insertions(+), 18 deletions(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 77b38c2b..937296b1 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -3074,27 +3074,17 @@ def _finishInterruptedCoalesceLeaf(self, childUuid, parentUuid): def _checkSlaves(self, vdi): try: - states = self._linstor.get_usage_states(vdi.uuid) - for node_name, state in states.items(): - self._checkSlave(node_name, vdi, state) + all_openers = self._linstor.get_volume_openers(vdi.uuid) + for openers in all_openers.itervalues(): + for opener in openers.values(): + if opener['process-name'] != 'tapdisk': + raise util.SMException( + 'VDI {} is in use: {}'.format(vdi.uuid, all_openers) + ) except LinstorVolumeManagerError as e: if e.code != LinstorVolumeManagerError.ERR_VOLUME_NOT_EXISTS: raise - @staticmethod - def _checkSlave(node_name, vdi, state): - # If state is None, LINSTOR doesn't know the host state - # (bad connection?). 
- if state is None: - raise util.SMException( - 'Unknown state for VDI {} on {}'.format(vdi.uuid, node_name) - ) - - if state: - raise util.SMException( - 'VDI {} is in use on {}'.format(vdi.uuid, node_name) - ) - ################################################################################ # diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 2930a9ed..81789e7a 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -823,6 +823,43 @@ def destroy_drbd_volumes(session, args): return str(False) +def get_drbd_openers(session, args): + try: + resource_name = args.get('resourceName') + volume = args.get('volume') + if not resource_name or volume is None: + raise Exception('Cannot get DRBD openers without resource name and/or volume.') + + path = '/sys/kernel/debug/drbd/resources/{}/volumes/{}/openers'.format( + resource_name, volume + ) + + with open(path, 'r') as openers: + # Not a big cost, so read all lines directly. + lines = openers.readlines() + + result = {} + + opener_re = re.compile('(.*)\\s+([0-9]+)\\s+([0-9]+)') + for line in lines: + match = opener_re.match(line) + assert match + + groups = match.groups() + process_name = groups[0] + pid = groups[1] + open_duration_ms = groups[2] + result[pid] = { + 'process-name': process_name, + 'open-duration': open_duration_ms + } + + return json.dumps(result) + except Exception as e: + util.SMlog('linstor-manager:get_drbd_openers error: {}'.format(e)) + raise + + if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, @@ -848,5 +885,6 @@ if __name__ == '__main__': 'createSr': create_sr, 'listDrbdVolumes': list_drbd_volumes, 'destroyDrbdVolume': destroy_drbd_volume, - 'destroyDrbdVolumes': destroy_drbd_volumes + 'destroyDrbdVolumes': destroy_drbd_volumes, + 'getDrbdOpeners': get_drbd_openers }) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 430e080b..d17845b5 100755 --- a/drivers/linstorvolumemanager.py +++ 
b/drivers/linstorvolumemanager.py @@ -1048,6 +1048,30 @@ def get_usage_states(self, volume_uuid): return states + def get_volume_openers(self, volume_uuid): + """ + Get openers of a volume. + :param str volume_uuid: The volume uuid to monitor. + :return: A dictionnary that contains openers. + :rtype: dict(str, obj) + """ + + PLUGIN_CMD = 'getDrbdOpeners' + + openers = {} + + session = util.get_localAPI_session() + hosts = session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + openers[host_record['hostname']] = json.loads( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { + 'resourceName': self.get_volume_name(volume_uuid), + 'volume': '0' + }) + ) + + return openers + def get_volumes_with_name(self): """ Give a volume dictionnary that contains names actually owned. From 01b4da52840ecdd92a13817cfb5ba670d67b7b59 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 22 Jul 2022 10:26:20 +0200 Subject: [PATCH 063/133] feat(linstorvhdutil): trace DRBD openers in case of EROFS errors Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 8 +++++++ drivers/linstorvhdutil.py | 48 +++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 81789e7a..9022499f 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -868,6 +868,13 @@ if __name__ == '__main__': 'attach': attach, 'detach': detach, 'destroy': destroy, + + # vhdutil wrappers called by linstorvhdutil. + # Note: When a VHD is open in RO mode (so for all vhdutil getters), + # the LVM layer is used directly to bypass DRBD verifications. + # In this case there can't be EROFS errors. + # Note 2: We assume linstorvhdutil executes remote calls on diskful + # DRBDs, otherwise we still have EROFS errors... 
'check': check, 'getVHDInfo': get_vhd_info, 'hasParent': has_parent, @@ -877,6 +884,7 @@ if __name__ == '__main__': 'getDepth': get_depth, 'getKeyHash': get_key_hash, 'getBlockBitmap': get_block_bitmap, + 'lockVdi': lock_vdi, 'lsofResource': lsof_resource, 'hasControllerRunning': has_controller_running, diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index f3d98705..d6a21c26 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -26,6 +26,36 @@ MANAGER_PLUGIN = 'linstor-manager' +def call_vhd_util(linstor, func, device_path, *args, **kwargs): + try: + return func(device_path, *args, **kwargs) + except util.CommandException as e: + # Raise if we don't have a lock on the volume on another host. + if e.code != errno.EROFS: + raise + + # Volume is locked on a host, find openers. + e_with_openers = None + try: + volume_uuid = linstor.get_volume_uuid_from_device_path( + device_path + ) + e_with_openers = util.CommandException( + e.code, + e.cmd, + e.reason + ' (openers: {})'.format( + linstor.get_volume_openers(volume_uuid) + ) + ) + except Exception as illformed_e: + raise util.CommandException( + e.code, + e.cmd, + e.reason + ' (unable to get openers: {})'.format(illformed_e) + ) + raise e_with_openers # pylint: disable = E0702 + + def linstorhostcall(local_method, remote_method): def decorated(func): def wrapper(*args, **kwargs): @@ -46,7 +76,7 @@ def wrapper(*args, **kwargs): try: if not in_use or socket.gethostname() in node_names: - return local_method(device_path, *args[2:], **kwargs) + return call_vhd_util(self._linstor, local_method, device_path, *args[2:], **kwargs) except util.CommandException as e: # EMEDIUMTYPE constant (124) is not available in python2. 
if e.code != errno.EROFS and e.code != 124: @@ -177,35 +207,35 @@ def get_block_bitmap(self, vdi_uuid, **kwargs): @linstormodifier() def create(self, path, size, static, msize=0): - return vhdutil.create(path, size, static, msize) + return call_vhd_util(self._linstor, vhdutil.create, path, size, static, msize) @linstormodifier() def set_size_virt_fast(self, path, size): - return vhdutil.setSizeVirtFast(path, size) + return call_vhd_util(self._linstor, vhdutil.setSizeVirtFast, path, size) @linstormodifier() def set_size_phys(self, path, size, debug=True): - return vhdutil.setSizePhys(path, size, debug) + return call_vhd_util(self._linstor, vhdutil.setSizePhys, path, size, debug) @linstormodifier() def set_parent(self, path, parentPath, parentRaw): - return vhdutil.setParent(path, parentPath, parentRaw) + return call_vhd_util(self._linstor, vhdutil.setParent, path, parentPath, parentRaw) @linstormodifier() def set_hidden(self, path, hidden=True): - return vhdutil.setHidden(path, hidden) + return call_vhd_util(self._linstor, vhdutil.setHidden, path, hidden) @linstormodifier() def set_key(self, path, key_hash): - return vhdutil.setKey(path, key_hash) + return call_vhd_util(self._linstor, vhdutil.setKey, path, key_hash) @linstormodifier() def kill_data(self, path): - return vhdutil.killData(path) + return call_vhd_util(self._linstor, vhdutil.killData, path) @linstormodifier() def snapshot(self, path, parent, parentRaw, msize=0, checkEmpty=True): - return vhdutil.snapshot(path, parent, parentRaw, msize, checkEmpty) + return call_vhd_util(self._linstor, vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) # -------------------------------------------------------------------------- # Helpers. From 2dfe7f41dd580c7cae697df654eeddfa211cdb4d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 29 Jul 2022 17:25:48 +0200 Subject: [PATCH 064/133] fix(linstorvolumemanager): compute correctly size in allocated_volume_size Remove replication count in computation. 
Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 59 +++++++++++++-------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index d17845b5..3806cc91 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -481,28 +481,6 @@ def min_physical_size(self): size = current_size return (size or 0) * 1024 - @property - def total_volume_size(self): - """ - Give the sum of all created volumes. The place count is used. - :return: The physical required size to use the volumes. - :rtype: int - """ - - size = 0 - for resource in self._get_resource_cache().resources: - for volume in resource.volumes: - # We ignore diskless pools of the form "DfltDisklessStorPool". - if volume.storage_pool_name == self._group_name: - current_size = volume.usable_size - if current_size < 0: - raise LinstorVolumeManagerError( - 'Failed to get usable size of `{}` on `{}`' - .format(resource.name, volume.storage_pool_name) - ) - size += current_size - return size * 1024 - @property def allocated_volume_size(self): """ @@ -514,25 +492,34 @@ def allocated_volume_size(self): :rtype: int """ - size = 0 + # Paths: /res_name/vol_number/size + sizes = {} + for resource in self._get_resource_cache().resources: - volume_size = None + if resource.name not in sizes: + current = sizes[resource.name] = {} + else: + current = sizes[resource.name] + for volume in resource.volumes: # We ignore diskless pools of the form "DfltDisklessStorPool". 
- if volume.storage_pool_name == self._group_name: - current_size = volume.allocated_size - if current_size < 0: - raise LinstorVolumeManagerError( - 'Failed to get allocated size of `{}` on `{}`' - .format(resource.name, volume.storage_pool_name) - ) + if volume.storage_pool_name != self._group_name: + continue + + current_size = volume.allocated_size + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get allocated size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) + current[volume.number] = max(current_size, current.get(volume.number) or 0) - if volume_size is None or current_size > volume_size: - volume_size = current_size - if volume_size is not None: - size += volume_size + total_size = 0 + for volumes in sizes.itervalues(): + for size in volumes.itervalues(): + total_size += size - return size * 1024 + return total_size * 1024 @property def metadata(self): From 5f6ebaa5fdf2a93d18b03ae2517e6c8ffec842dc Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 9 Aug 2022 11:07:57 +0200 Subject: [PATCH 065/133] feat(LinstorSR): use DRBD openers instead of lsof to log in blktap2 Signed-off-by: Ronan Abhamon --- drivers/blktap2.py | 4 +- drivers/linstor-manager | 48 +-------- drivers/linstorvolumemanager.py | 176 +++++++++++++++++--------------- 3 files changed, 100 insertions(+), 128 deletions(-) diff --git a/drivers/blktap2.py b/drivers/blktap2.py index 21f5dfc5..26ab689a 100755 --- a/drivers/blktap2.py +++ b/drivers/blktap2.py @@ -36,7 +36,7 @@ import xs_errors import XenAPI import scsiutil -from linstorvolumemanager import log_lsof_drbd +from linstorvolumemanager import log_drbd_openers from syslog import openlog, syslog from stat import * # S_ISBLK(), ... 
import nfs @@ -832,7 +832,7 @@ def launch_on_tap(cls, blktap, path, _type, options): time.sleep(1) continue if err == errno.EROFS: - log_lsof_drbd(path) + log_drbd_openers(path) break try: tapdisk = cls.__from_blktap(blktap) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 9022499f..4d0ba299 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -27,7 +27,7 @@ import XenAPI import XenAPIPlugin from linstorjournaler import LinstorJournaler -from linstorvolumemanager import get_controller_uri, LinstorVolumeManager +from linstorvolumemanager import get_controller_uri, get_local_volume_openers, LinstorVolumeManager from lock import Lock import json import LinstorSR @@ -445,22 +445,6 @@ def lock_vdi(session, args): return str(False) -def lsof_resource(session, args): - try: - drbd_path = args['drbdPath'] - (ret, stdout, stderr) = util.doexec(['lsof', '-V', drbd_path]) - if ret == 0: - return 'DRBD resource `{}` is open: {}'.format( - drbd_path, stdout.rstrip() - ) - return '`lsof` on DRBD resource `{}` returned {}: {}'.format( - drbd_path, ret, stdout.rstrip() - ) - except Exception as e: - util.SMlog('linstor-manager:lsof_drbd error: {}'.format(e)) - raise - - def has_controller_running(session, args): (ret, stdout, stderr) = util.doexec([ 'systemctl', 'is-active', '--quiet', 'linstor-controller' @@ -827,34 +811,7 @@ def get_drbd_openers(session, args): try: resource_name = args.get('resourceName') volume = args.get('volume') - if not resource_name or volume is None: - raise Exception('Cannot get DRBD openers without resource name and/or volume.') - - path = '/sys/kernel/debug/drbd/resources/{}/volumes/{}/openers'.format( - resource_name, volume - ) - - with open(path, 'r') as openers: - # Not a big cost, so read all lines directly. 
- lines = openers.readlines() - - result = {} - - opener_re = re.compile('(.*)\\s+([0-9]+)\\s+([0-9]+)') - for line in lines: - match = opener_re.match(line) - assert match - - groups = match.groups() - process_name = groups[0] - pid = groups[1] - open_duration_ms = groups[2] - result[pid] = { - 'process-name': process_name, - 'open-duration': open_duration_ms - } - - return json.dumps(result) + return get_local_volume_openers(resource_name, volume) except Exception as e: util.SMlog('linstor-manager:get_drbd_openers error: {}'.format(e)) raise @@ -886,7 +843,6 @@ if __name__ == '__main__': 'getBlockBitmap': get_block_bitmap, 'lockVdi': lock_vdi, - 'lsofResource': lsof_resource, 'hasControllerRunning': has_controller_running, 'addHost': add_host, 'removeHost': remove_host, diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 3806cc91..6f4c5900 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -45,81 +45,56 @@ PLUGIN = 'linstor-manager' -# Check if a path is a DRBD resource and log the process name/pid -# that opened it. -def log_lsof_drbd(path): - PLUGIN_CMD = 'lsofResource' +# ============================================================================== - # Ignore if it's not a symlink to DRBD resource. - if not path.startswith(DRBD_BY_RES_PATH): - return +def get_local_volume_openers(resource_name, volume): + if not resource_name or volume is None: + raise Exception('Cannot get DRBD openers without resource name and/or volume.') - # Compute resource name. - res_name_end = path.find('/', len(DRBD_BY_RES_PATH)) - if res_name_end == -1: - return - res_name = path[len(DRBD_BY_RES_PATH):res_name_end] + path = '/sys/kernel/debug/drbd/resources/{}/volumes/{}/openers'.format( + resource_name, volume + ) - try: - # Ensure path is a DRBD. 
- drbd_path = os.path.realpath(path) - stats = os.stat(drbd_path) - if not stat.S_ISBLK(stats.st_mode) or os.major(stats.st_rdev) != 147: - return + with open(path, 'r') as openers: + # Not a big cost, so read all lines directly. + lines = openers.readlines() - # Find where the device is open. - (ret, stdout, stderr) = util.doexec(['drbdadm', 'status', res_name]) - if ret != 0: - util.SMlog('Failed to execute `drbdadm status` on `{}`: {}'.format( - res_name, stderr - )) - return + result = {} - # Is it a local device? - if stdout.startswith('{} role:Primary'.format(res_name)): - (ret, stdout, stderr) = util.doexec(['lsof', drbd_path]) - if ret == 0: - util.SMlog( - 'DRBD resource `{}` is open on local host: {}' - .format(path, stdout) - ) - else: - util.SMlog( - '`lsof` on local DRBD resource `{}` returned {}: {}' - .format(path, ret, stderr) - ) - return + opener_re = re.compile('(.*)\\s+([0-9]+)\\s+([0-9]+)') + for line in lines: + match = opener_re.match(line) + assert match - # Is it a remote device? 
- res = REG_DRBDADM_PRIMARY.search(stdout) - if not res: - util.SMlog( - 'Cannot find where is open DRBD resource `{}`' - .format(path) - ) - return - node_name = res.groups()[0] + groups = match.groups() + process_name = groups[0] + pid = groups[1] + open_duration_ms = groups[2] + result[pid] = { + 'process-name': process_name, + 'open-duration': open_duration_ms + } - session = util.get_localAPI_session() - hosts = session.xenapi.host.get_all_records() - for host_ref, host_record in hosts.items(): - if node_name != host_record['hostname']: - continue + return json.dumps(result) - ret = session.xenapi.host.call_plugin( - host_ref, PLUGIN, PLUGIN_CMD, {'drbdPath': drbd_path}, - ) - util.SMlog('DRBD resource `{}` status on host `{}`: {}'.format( - path, host_ref, ret - )) - return - util.SMlog('Cannot find primary host of DRBD resource {}'.format(path)) - except Exception as e: - util.SMlog( - 'Got exception while trying to determine where DRBD resource ' + - '`{}` is open: {}'.format(path, e) +def get_all_volume_openers(resource_name, volume): + PLUGIN_CMD = 'getDrbdOpeners' + + volume = str(volume) + openers = {} + + session = util.get_localAPI_session() + hosts = session.xenapi.host.get_all_records() + for host_ref, host_record in hosts.items(): + openers[host_record['hostname']] = json.loads( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { + 'resourceName': resource_name, + 'volume': volume + }) ) + return openers + # ============================================================================== @@ -1042,22 +1017,8 @@ def get_volume_openers(self, volume_uuid): :return: A dictionnary that contains openers. 
:rtype: dict(str, obj) """ + return get_all_volume_openers(self.get_volume_name(volume_uuid), '0') - PLUGIN_CMD = 'getDrbdOpeners' - - openers = {} - - session = util.get_localAPI_session() - hosts = session.xenapi.host.get_all_records() - for host_ref, host_record in hosts.items(): - openers[host_record['hostname']] = json.loads( - session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { - 'resourceName': self.get_volume_name(volume_uuid), - 'volume': '0' - }) - ) - - return openers def get_volumes_with_name(self): """ @@ -2755,3 +2716,58 @@ def _mount_volume(cls, volume_path, mountpoint, mount=True): 'Failed to umount volume {} on {}: {}' .format(volume_path, mountpoint, e) ) + + +# ============================================================================== + +# Check if a path is a DRBD resource and log the process name/pid +# that opened it. +def log_drbd_openers(path): + # Ignore if it's not a symlink to DRBD resource. + if not path.startswith(DRBD_BY_RES_PATH): + return + + # Compute resource name. + res_name_end = path.find('/', len(DRBD_BY_RES_PATH)) + if res_name_end == -1: + return + res_name = path[len(DRBD_BY_RES_PATH):res_name_end] + + volume_end = path.rfind('/') + if volume_end == res_name_end: + return + volume = path[volume_end + 1:] + + try: + # Ensure path is a DRBD. + drbd_path = os.path.realpath(path) + stats = os.stat(drbd_path) + if not stat.S_ISBLK(stats.st_mode) or os.major(stats.st_rdev) != 147: + return + + # Find where the device is open. + (ret, stdout, stderr) = util.doexec(['drbdadm', 'status', res_name]) + if ret != 0: + util.SMlog('Failed to execute `drbdadm status` on `{}`: {}'.format( + res_name, stderr + )) + return + + # Is it a local device? + if stdout.startswith('{} role:Primary'.format(res_name)): + util.SMlog( + 'DRBD resource `{}` is open on local host: {}' + .format(path, get_local_volume_openers(res_name, volume)) + ) + return + + # Is it a remote device? 
+ util.SMlog( + 'DRBD resource `{}` is open on hosts: {}' + .format(path, get_all_volume_openers(res_name, volume)) + ) + except Exception as e: + util.SMlog( + 'Got exception while trying to determine where DRBD resource ' + + '`{}` is open: {}'.format(path, e) + ) From 54727abb411ba6759751e3db61cf3b61fba3465e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 25 Aug 2022 12:11:18 +0200 Subject: [PATCH 066/133] feat(LinstorSR): support cProfile to trace calls when a command is executed Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 11 ++++++++++- drivers/util.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index e5f6f85c..00554d73 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -69,6 +69,9 @@ # - The other hearbeat volumes must be diskful and limited to a maximum of 3. USE_HTTP_NBD_SERVERS = True +# Useful flag to trace calls using cProfile. +TRACE_PERFS = False + # ============================================================================== # TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', @@ -2779,6 +2782,12 @@ def _detach_using_http_nbd(self): if __name__ == '__main__': - SRCommand.run(LinstorSR, DRIVER_INFO) + def run(): + SRCommand.run(LinstorSR, DRIVER_INFO) + + if not TRACE_PERFS: + run() + else: + util.make_profile('LinstorSR', run) else: SR.registerSR(LinstorSR) diff --git a/drivers/util.py b/drivers/util.py index 484c5f70..2ccfe3d6 100755 --- a/drivers/util.py +++ b/drivers/util.py @@ -1857,3 +1857,45 @@ def check_pid_exists(pid): return False else: return True + + +def make_profile(name, function): + """ + Helper to execute cProfile using unique log file. 
+ """ + + import cProfile + import itertools + import os.path + import time + + assert name + assert function + + FOLDER = '/tmp/sm-perfs/' + makedirs(FOLDER) + + filename = time.strftime('{}_%Y%m%d_%H%M%S.prof'.format(name)) + + def gen_path(path): + yield path + root, ext = os.path.splitext(path) + for i in itertools.count(start=1, step=1): + yield root + '.{}.'.format(i) + ext + + for profile_path in gen_path(FOLDER + filename): + try: + file = open_atomic(profile_path, 'w') + file.close() + break + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise + + try: + SMlog('* Start profiling of {} ({}) *'.format(name, filename)) + cProfile.runctx('function()', None, locals(), profile_path) + finally: + SMlog('* End profiling of {} ({}) *'.format(name, filename)) From 06222cef6bd16a960e298ff1039d5a7614c6f58d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 24 Aug 2022 17:09:11 +0200 Subject: [PATCH 067/133] fix(LinstorJournaler): reset namespace when `get` is called Otherwise, we can be in the wrong namespace and the key to find will be inaccessible. --- drivers/linstorjournaler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/linstorjournaler.py b/drivers/linstorjournaler.py index 3993f601..1e85ec96 100755 --- a/drivers/linstorjournaler.py +++ b/drivers/linstorjournaler.py @@ -107,6 +107,7 @@ def remove(self, type, identifier): ) def get(self, type, identifier): + self._reset_namespace() return self._journal.get(self._get_key(type, identifier)) def get_all(self, type): From a78d922a274a9d2ba5ecd335a1fc7f6e02c9187c Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 25 Aug 2022 10:54:56 +0200 Subject: [PATCH 068/133] fix(linstorvhdutil): fix coalesce with VM running under specific scenario: When a VM is running, we can't coalesce without this patch with a long chain of VHDs because a parent can be in use on another host, and so a EROFS can be emitted by vhd-util. 
So to fix this problem we run vhd-util on the remote host instead of the master in case of failure in the cleanup algorithm. Impacted vhd-util functions: coalesce, getParent, repair. --- drivers/LinstorSR.py | 12 +- drivers/cleanup.py | 22 +++- drivers/linstor-manager | 37 ++++++ drivers/linstorvhdutil.py | 269 ++++++++++++++++++++++++++------------ drivers/vhdutil.py | 5 +- 5 files changed, 251 insertions(+), 94 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 00554d73..47ac3c85 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -163,7 +163,9 @@ def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): # If the virtual VHD size is lower than the LINSTOR volume size, # there is nothing to do. vhd_size = compute_volume_size( - LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), # pylint: disable = E1120 image_type ) @@ -207,7 +209,9 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): device_path = linstor.get_device_path(vdi_uuid) new_volume_size = LinstorVolumeManager.round_up_volume_size( - LinstorVhdUtil(session, linstor).get_size_phys(device_path) + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_phys(device_path) # pylint: disable = E1120 ) volume_info = linstor.get_volume_info(vdi_uuid) @@ -1231,8 +1235,10 @@ def _load_vdis_ex(self): self.vdis[vdi_uuid] = vdi if vdi.vdi_type == vhdutil.VDI_TYPE_VHD: + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 vdi.sm_config_override['key_hash'] = \ - self._vhdutil.get_key_hash(vdi_uuid) + self._vhdutil.get_key_hash(vdi_uuid) # pylint: disable = E1120 # 4.c. Update CBT status of disks either just added # or already in XAPI. 
diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 937296b1..9d55baac 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -840,12 +840,14 @@ def _reportCoalesceError(vdi, ce): xapi.message.create(msg_name, "3", "SR", vdi.sr.uuid, msg_body) _reportCoalesceError = staticmethod(_reportCoalesceError) + def coalesce(self): + vhdutil.coalesce(self.path) + def _doCoalesceVHD(vdi): try: - startTime = time.time() vhdSize = vdi.getSizeVHD() - vhdutil.coalesce(vdi.path) + vdi.coalesce() endTime = time.time() vdi.sr.recordStorageSpeed(startTime, endTime, vhdSize) except util.CommandException, ce: @@ -1437,6 +1439,9 @@ def pause(self, failfast=False): ) return super(LinstorVDI, self).pause(failfast) + def coalesce(self): + self.sr._vhdutil.force_coalesce(self.path) + def _relinkSkip(self): abortFlag = IPCFlag(self.sr.uuid) for child in self.children: @@ -1461,6 +1466,19 @@ def _relinkSkip(self): blktap2.VDI.tap_unpause(session, sr_uuid, vdi_uuid) self.children = [] + def _setParent(self, parent): + self.sr._vhdutil.force_parent(self.path, parent.path) + self.parent = parent + self.parentUuid = parent.uuid + parent.children.append(self) + try: + self.setConfig(self.DB_VHD_PARENT, self.parentUuid) + Util.log("Updated the vhd-parent field for child %s with %s" % \ + (self.uuid, self.parentUuid)) + except: + Util.log("Failed to update %s with vhd-parent field %s" % \ + (self.uuid, self.parentUuid)) + def _setHidden(self, hidden=True): HIDDEN_TAG = 'hidden' diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 4d0ba299..5485b900 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -416,6 +416,37 @@ def get_block_bitmap(session, args): raise +def set_parent(session, args): + try: + device_path = args['devicePath'] + parent_path = args['parentPath'] + vhdutil.setParent(device_path, parent_path, False) + return '' + except Exception as e: + util.SMlog('linstor-manager:set_parent error: {}'.format(e)) + raise + + +def coalesce(session, 
args): + try: + device_path = args['devicePath'] + vhdutil.coalesce(device_path) + return '' + except Exception as e: + util.SMlog('linstor-manager:coalesce error: {}'.format(e)) + raise + + +def repair(session, args): + try: + device_path = args['devicePath'] + vhdutil.repair(device_path) + return '' + except Exception as e: + util.SMlog('linstor-manager:repair error: {}'.format(e)) + raise + + def lock_vdi(session, args): lock = None try: @@ -842,6 +873,12 @@ if __name__ == '__main__': 'getKeyHash': get_key_hash, 'getBlockBitmap': get_block_bitmap, + # Called by cleanup.py to coalesce when a primary + # is opened on a non-local host. + 'setParent': set_parent, + 'coalesce': coalesce, + 'repair': repair, + 'lockVdi': lock_vdi, 'hasControllerRunning': has_controller_running, 'addHost': add_host, diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index d6a21c26..4d031e12 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -25,39 +25,30 @@ MANAGER_PLUGIN = 'linstor-manager' +# EMEDIUMTYPE constant (124) is not available in python2. +EMEDIUMTYPE = 124 -def call_vhd_util(linstor, func, device_path, *args, **kwargs): - try: - return func(device_path, *args, **kwargs) - except util.CommandException as e: - # Raise if we don't have a lock on the volume on another host. - if e.code != errno.EROFS: - raise - - # Volume is locked on a host, find openers. 
- e_with_openers = None + +def call_vhd_util_on_host(session, host_ref, method, device_path, args): try: - volume_uuid = linstor.get_volume_uuid_from_device_path( - device_path + response = session.xenapi.host.call_plugin( + host_ref, MANAGER_PLUGIN, method, args ) - e_with_openers = util.CommandException( - e.code, - e.cmd, - e.reason + ' (openers: {})'.format( - linstor.get_volume_openers(volume_uuid) - ) - ) - except Exception as illformed_e: - raise util.CommandException( - e.code, - e.cmd, - e.reason + ' (unable to get openers: {})'.format(illformed_e) - ) - raise e_with_openers # pylint: disable = E0702 + except Exception as e: + util.SMlog('call-plugin ({} with {}) exception: {}'.format( + method, args, e + )) + raise + + util.SMlog('call-plugin ({} with {}) returned: {}'.format( + method, args, response + )) + + return response def linstorhostcall(local_method, remote_method): - def decorated(func): + def decorated(response_parser): def wrapper(*args, **kwargs): self = args[0] vdi_uuid = args[1] @@ -76,45 +67,27 @@ def wrapper(*args, **kwargs): try: if not in_use or socket.gethostname() in node_names: - return call_vhd_util(self._linstor, local_method, device_path, *args[2:], **kwargs) + return self._call_local_vhd_util(local_method, device_path, *args[2:], **kwargs) except util.CommandException as e: - # EMEDIUMTYPE constant (124) is not available in python2. - if e.code != errno.EROFS and e.code != 124: + if e.code != errno.EROFS and e.code != EMEDIUMTYPE: raise # B. Execute the plugin on master or slave. 
- def exec_remote_method(): + remote_args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + remote_args.update(**kwargs) + remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} + + def remote_call(): host_ref = self._get_readonly_host( vdi_uuid, device_path, node_names ) - args = { - 'devicePath': device_path, - 'groupName': self._linstor.group_name - } - args.update(**kwargs) - - try: - response = self._session.xenapi.host.call_plugin( - host_ref, MANAGER_PLUGIN, remote_method, args - ) - except Exception as e: - util.SMlog('call-plugin ({} with {}) exception: {}'.format( - remote_method, args, e - )) - raise + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + response = util.retry(remote_call, 5, 2) - util.SMlog('call-plugin ({} with {}) returned: {}'.format( - remote_method, args, response - )) - if response == 'False': - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Plugin {} failed'.format(MANAGER_PLUGIN) - ) - kwargs['response'] = response - - util.retry(exec_remote_method, 5, 3) - return func(*args, **kwargs) + return response_parser(self, vdi_uuid, response) return wrapper return decorated @@ -137,7 +110,7 @@ def __init__(self, session, linstor): self._linstor = linstor # -------------------------------------------------------------------------- - # Getters. + # Getters: read locally and try on another host in case of failure. 
# -------------------------------------------------------------------------- def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): @@ -153,11 +126,13 @@ def _check(self, vdi_uuid, **kwargs): def get_vhd_info(self, vdi_uuid, include_parent=True): kwargs = {'includeParent': str(include_parent)} - return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) # pylint: disable = E1123 @linstorhostcall(vhdutil.getVHDInfo, 'getVHDInfo') - def _get_vhd_info(self, vdi_uuid, *args, **kwargs): - obj = json.loads(kwargs['response']) + def _get_vhd_info(self, vdi_uuid, response): + obj = json.loads(response) vhd_info = vhdutil.VHDInfo(vdi_uuid) vhd_info.sizeVirt = obj['sizeVirt'] @@ -171,71 +146,91 @@ def _get_vhd_info(self, vdi_uuid, *args, **kwargs): return vhd_info @linstorhostcall(vhdutil.hasParent, 'hasParent') - def has_parent(self, vdi_uuid, **kwargs): - return distutils.util.strtobool(kwargs['response']) + def has_parent(self, vdi_uuid, response): + return distutils.util.strtobool(response) def get_parent(self, vdi_uuid): return self._get_parent(vdi_uuid, self._extract_uuid) @linstorhostcall(vhdutil.getParent, 'getParent') - def _get_parent(self, vdi_uuid, *args, **kwargs): - return kwargs['response'] + def _get_parent(self, vdi_uuid, response): + return response @linstorhostcall(vhdutil.getSizeVirt, 'getSizeVirt') - def get_size_virt(self, vdi_uuid, **kwargs): - return int(kwargs['response']) + def get_size_virt(self, vdi_uuid, response): + return int(response) @linstorhostcall(vhdutil.getSizePhys, 'getSizePhys') - def get_size_phys(self, vdi_uuid, **kwargs): - return int(kwargs['response']) + def get_size_phys(self, vdi_uuid, response): + return int(response) @linstorhostcall(vhdutil.getDepth, 'getDepth') - def get_depth(self, vdi_uuid, **kwargs): - return 
int(kwargs['response']) + def get_depth(self, vdi_uuid, response): + return int(response) @linstorhostcall(vhdutil.getKeyHash, 'getKeyHash') - def get_key_hash(self, vdi_uuid, **kwargs): - return kwargs['response'] or None + def get_key_hash(self, vdi_uuid, response): + return response or None @linstorhostcall(vhdutil.getBlockBitmap, 'getBlockBitmap') - def get_block_bitmap(self, vdi_uuid, **kwargs): - return base64.b64decode(kwargs['response']) + def get_block_bitmap(self, vdi_uuid, response): + return base64.b64decode(response) # -------------------------------------------------------------------------- - # Setters. + # Setters: only used locally. # -------------------------------------------------------------------------- @linstormodifier() def create(self, path, size, static, msize=0): - return call_vhd_util(self._linstor, vhdutil.create, path, size, static, msize) + return self._call_local_vhd_util(vhdutil.create, path, size, static, msize) @linstormodifier() def set_size_virt_fast(self, path, size): - return call_vhd_util(self._linstor, vhdutil.setSizeVirtFast, path, size) + return self._call_local_vhd_util(vhdutil.setSizeVirtFast, path, size) @linstormodifier() def set_size_phys(self, path, size, debug=True): - return call_vhd_util(self._linstor, vhdutil.setSizePhys, path, size, debug) + return self._call_local_vhd_util(vhdutil.setSizePhys, path, size, debug) @linstormodifier() - def set_parent(self, path, parentPath, parentRaw): - return call_vhd_util(self._linstor, vhdutil.setParent, path, parentPath, parentRaw) + def set_parent(self, path, parentPath, parentRaw=False): + return self._call_local_vhd_util(vhdutil.setParent, path, parentPath, parentRaw) @linstormodifier() def set_hidden(self, path, hidden=True): - return call_vhd_util(self._linstor, vhdutil.setHidden, path, hidden) + return self._call_local_vhd_util(vhdutil.setHidden, path, hidden) @linstormodifier() def set_key(self, path, key_hash): - return call_vhd_util(self._linstor, vhdutil.setKey, 
path, key_hash) + return self._call_local_vhd_util(vhdutil.setKey, path, key_hash) @linstormodifier() def kill_data(self, path): - return call_vhd_util(self._linstor, vhdutil.killData, path) + return self._call_local_vhd_util(vhdutil.killData, path) @linstormodifier() def snapshot(self, path, parent, parentRaw, msize=0, checkEmpty=True): - return call_vhd_util(self._linstor, vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) + return self._call_local_vhd_util(vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) + + # -------------------------------------------------------------------------- + # Remote setters: write locally and try on another host in case of failure. + # -------------------------------------------------------------------------- + + @linstormodifier() + def force_parent(self, path, parentPath, parentRaw=False): + kwargs = { + 'parentPath': str(parentPath), + 'parentRaw': parentRaw + } + return self._call_vhd_util(vhdutil.setParent, 'setParent', path, **kwargs) + + @linstormodifier() + def force_coalesce(self, path): + return self._call_vhd_util(vhdutil.coalesce, 'coalesce', path) + + @linstormodifier() + def force_repair(self, path): + return self._call_vhd_util(vhdutil.repair, 'repair', path) # -------------------------------------------------------------------------- # Helpers. @@ -273,3 +268,105 @@ def _get_readonly_host(self, vdi_uuid, device_path, node_names): opterr='Unable to find a valid host from VDI: {} (path={})' .format(vdi_uuid, device_path) ) + + # -------------------------------------------------------------------------- + + def _call_local_vhd_util(self, local_method, device_path, *args, **kwargs): + try: + def local_call(): + return local_method(device_path, *args, **kwargs) + return util.retry(local_call, 5, 2) + except util.CommandException as e: + if e.code != errno.EROFS and e.code != EMEDIUMTYPE: + raise + + # Volume is locked on a host, find openers. 
+ e_with_openers = None + try: + volume_uuid = self._linstor.get_volume_uuid_from_device_path( + device_path + ) + e_with_openers = util.CommandException( + e.code, + e.cmd, + e.reason + ' (openers: {})'.format( + self._linstor.get_volume_openers(volume_uuid) + ) + ) + except Exception as illformed_e: + raise util.CommandException( + e.code, + e.cmd, + e.reason + ' (unable to get openers: {})'.format(illformed_e) + ) + raise e_with_openers # pylint: disable = E0702 + + def _call_vhd_util(self, local_method, remote_method, device_path, *args, **kwargs): + # A. Try to write locally... + try: + def local_call(): + return local_method(device_path, *args, **kwargs) + return util.retry(local_call, 5, 2) + except util.CommandException as e: + if e.code != errno.EROFS and e.code != EMEDIUMTYPE: + raise + + # B. Execute the command on another host. + # B.1. Get host list. + try: + hosts = self._session.xenapi.host.get_all_records() + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to get host list to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + + # B.2. Prepare remote args. + remote_args = { + 'devicePath': device_path, + 'groupName': self._linstor.group_name + } + remote_args.update(**kwargs) + remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} + + volume_uuid = self._linstor.get_volume_uuid_from_device_path( + device_path + ) + + # B.3. Call! 
+ def remote_call(): + try: + all_openers = self._linstor.get_volume_openers(volume_uuid) + except Exception as e: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Unable to get DRBD openers to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + + no_host_found = True + for hostname, openers in all_openers.iteritems(): + if not openers: + continue + + try: + host_ref = next(ref for ref, rec in hosts.iteritems() if rec['hostname'] == hostname) + except StopIteration: + continue + + no_host_found = False + try: + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + except Exception: + pass + + if no_host_found: + return local_method(device_path, *args, **kwargs) + + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='No valid host found to run vhd-util command `{}` (path={}): {}' + .format(remote_method, device_path, e) + ) + return util.retry(remote_call, 5, 2) diff --git a/drivers/vhdutil.py b/drivers/vhdutil.py index 0a8fe918..d75edb11 100755 --- a/drivers/vhdutil.py +++ b/drivers/vhdutil.py @@ -97,9 +97,8 @@ def calcOverheadFull(virtual_size): def fullSizeVHD(virtual_size): return virtual_size + calcOverheadFull(virtual_size) -def ioretry(cmd): - return util.ioretry(lambda: util.pread2(cmd), - errlist = [errno.EIO, errno.EROFS, errno.EAGAIN]) +def ioretry(cmd, errlist=[errno.EIO, errno.EAGAIN]): + return util.ioretry(lambda: util.pread2(cmd), errlist) def getVHDInfo(path, extractUuidFunction, includeParent = True): """Get the VHD info. The parent info may optionally be omitted: vhd-util From d1d1fabbb57d54967869643ad45c311ca85b16d6 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 5 Sep 2022 15:09:17 +0200 Subject: [PATCH 069/133] fix(linstorvolumemanager): `_get_volumes_info` doesn't raise with offline host Ensure this method doesn't raise an exception when a host is offline. 
Otherwise we can't use properly the HA when a host is unreachable and it's a problem to restart VMs on a valid host. Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 6f4c5900..a1bc151e 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1789,15 +1789,18 @@ def _get_volumes_info(self, volume_name=None): max(current.allocated_size, allocated_size) or \ allocated_size - if volume.usable_size < 0: - raise LinstorVolumeManagerError( - 'Failed to get usable size of `{}` on `{}`' - .format(resource.name, volume.storage_pool_name) - ) - virtual_size = volume.usable_size - - current.virtual_size = current.virtual_size and \ - min(current.virtual_size, virtual_size) or virtual_size + usable_size = volume.usable_size + if usable_size > 0 and ( + usable_size < current.virtual_size or + not current.virtual_size + ): + current.virtual_size = usable_size + + if current.virtual_size <= 0: + raise LinstorVolumeManagerError( + 'Failed to get usable size of `{}` on `{}`' + .format(resource.name, volume.storage_pool_name) + ) for current in all_volume_info.values(): current.allocated_size *= 1024 From a4042633f227611f794be254bc23a289136a8942 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 12 Sep 2022 15:56:09 +0200 Subject: [PATCH 070/133] fix(linstorvolumemanager): remove double prefix on kv group name - Before this patch, when the kv store was created/accessed, a double "xcp-sr-" prefix was used. - This change is not compatible with existing LINSTOR SR instances! 
Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index a1bc151e..3ee5d248 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -2195,7 +2195,7 @@ def _get_volumes_by_property( def _create_linstor_kv(self, namespace): return linstor.KV( - self._get_store_name(), + self._group_name, uri=self._linstor.controller_host(), namespace=namespace ) @@ -2205,9 +2205,6 @@ def _get_volume_properties(self, volume_uuid): properties.namespace = self._build_volume_namespace(volume_uuid) return properties - def _get_store_name(self): - return 'xcp-sr-{}'.format(self._group_name) - @classmethod def _build_sr_namespace(cls): return '/{}/'.format(cls.NAMESPACE_SR) From 92967fafb7c73444fa4390cf40bba700912799c0 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 12 Sep 2022 17:54:57 +0200 Subject: [PATCH 071/133] feat(LinstorSR): add linstor-kv-dump helper to print kv store Signed-off-by: Ronan Abhamon --- Makefile | 1 + scripts/linstor-kv-dump | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100755 scripts/linstor-kv-dump diff --git a/Makefile b/Makefile index 89f7762c..d0f043f5 100755 --- a/Makefile +++ b/Makefile @@ -239,6 +239,7 @@ install: precheck install -m 755 drivers/fcoelib.py $(SM_STAGING)$(SM_DEST) mkdir -p $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/fork-log-daemon $(SM_STAGING)$(LIBEXEC) + install -m 755 scripts/linstor-kv-dump $(SM_STAGING)$(BIN_DEST) install -m 755 scripts/local-device-change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/check-device-sharing $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/usb_change $(SM_STAGING)$(LIBEXEC) diff --git a/scripts/linstor-kv-dump b/scripts/linstor-kv-dump new file mode 100755 index 00000000..93598d7c --- /dev/null +++ b/scripts/linstor-kv-dump @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# +# 
Copyright (C) 2022 Vates SAS +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import argparse +import json +import linstor + +def dump_kv(controller_uri, group_name, namespace): + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace=namespace + ) + print(json.dumps(kv, sort_keys=True, indent=2)) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-u', '--uri', required=True) + parser.add_argument('-g', '--group-name', required=True) + parser.add_argument('-n', '--namespace', default='/') + args = parser.parse_args() + dump_kv(args.uri, args.group_name, args.namespace) + +if __name__ == '__main__': + main() From 62685a888a9122d4a919c8c13f80c99d296df481 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 14 Sep 2022 10:17:18 +0200 Subject: [PATCH 072/133] fix(LinstorSR): disable VHD key hash usage to limit exec time Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 47ac3c85..374d6cb9 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -72,6 +72,9 @@ # Useful flag to trace calls using cProfile. TRACE_PERFS = False +# Enable/Disable VHD key hash support. 
+USE_KEY_HASH = False + # ============================================================================== # TODO: Supports 'VDI_INTRODUCE', 'VDI_RESET_ON_BOOT/2', 'SR_TRIM', @@ -1234,7 +1237,7 @@ def _load_vdis_ex(self): vdi = self.vdi(vdi_uuid) self.vdis[vdi_uuid] = vdi - if vdi.vdi_type == vhdutil.VDI_TYPE_VHD: + if USE_KEY_HASH and vdi.vdi_type == vhdutil.VDI_TYPE_VHD: # TODO: Replace pylint comment with this feature when possible: # https://github.com/PyCQA/pylint/pull/2926 vdi.sm_config_override['key_hash'] = \ From 2cfd01a1d5e5268a71364ce66a9f615be714f7ae Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 15 Sep 2022 11:34:25 +0200 Subject: [PATCH 073/133] fix(minidrbdcluster): ensure SIGINT is handled correctly This patch is here to make sure no LINSTOR controller survives when systemd asks minidrbdcluster to stop with `SIGINT`. - Remove `os.system`, it's totally unsafe, all signals are ignored with it. - Use `subprocess.Popen` instead and correctly catch signal exceptions, it works because `wait` call doesn't hide the signals. - Ensure `SIGINT` is only sent to the main process, not to the subprocesses. - Ensure `SIGKILL` is NEVER sent to minidrbdcluster. Signed-off-by: Ronan Abhamon --- scripts/minidrbdcluster | 35 ++++++++++++++++++++++++--------- systemd/minidrbdcluster.service | 1 + 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/scripts/minidrbdcluster b/scripts/minidrbdcluster index fb4de09b..4cdc59e6 100755 --- a/scripts/minidrbdcluster +++ b/scripts/minidrbdcluster @@ -1,7 +1,6 @@ #! 
/usr/bin/env python2 import configparser -import os import re import signal import subprocess @@ -35,24 +34,42 @@ def sig_handler(sig, frame): ) +def preexec_subprocess(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +def exec_subprocess(args): + proc = subprocess.Popen(args, preexec_fn=preexec_subprocess) + raise_sigint = False + while True: + try: + proc.wait() + break + except KeyboardInterrupt: + raise_sigint = True + except: # noqa: E722 + pass + + if raise_sigint: + raise KeyboardInterrupt + + return proc.returncode + + def call_systemd(operation, service): verbose = operation in ('start', 'stop') if verbose: print('Trying to %s %s' % (operation, service)) - r = os.system('systemctl %s %s' % (operation, service)) + ret = exec_subprocess(['systemctl', operation, service]) if verbose: print('%s for %s %s' % ( - 'success' if r == 0 else 'failure', operation, service + 'success' if ret == 0 else 'failure', operation, service )) - return r == 0 + return ret == 0 def ensure_systemd_started(service): - args = ['systemctl', 'is-active', '--quiet', service] - - proc = subprocess.Popen(args) - proc.wait() - if not proc.returncode: + if not exec_subprocess(['systemctl', 'is-active', '--quiet', service]): return True # Already active. return call_systemd('start', service) diff --git a/systemd/minidrbdcluster.service b/systemd/minidrbdcluster.service index 3de6ac4f..1ddf91f3 100644 --- a/systemd/minidrbdcluster.service +++ b/systemd/minidrbdcluster.service @@ -10,6 +10,7 @@ Environment=PYTHONUNBUFFERED=1 ExecStart=/opt/xensource/libexec/minidrbdcluster KillMode=process KillSignal=SIGINT +SendSIGKILL=no StandardOutput=journal StandardError=journal SyslogIdentifier=minidrbdcluster From 8c34216cc5c9c650baf5719303237bcaf1edb49d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 15 Sep 2022 11:49:34 +0200 Subject: [PATCH 074/133] feat(minidrbdcluster): stop resource services at startup - Ensure all services are stopped when minidrbdcluster is started. 
- Clean code to parse only once the systemd unit string. - Log unhandled exceptions. Signed-off-by: Ronan Abhamon --- scripts/minidrbdcluster | 50 ++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/scripts/minidrbdcluster b/scripts/minidrbdcluster index 4cdc59e6..eae7cbfe 100755 --- a/scripts/minidrbdcluster +++ b/scripts/minidrbdcluster @@ -83,8 +83,7 @@ def show_status(services, status): print('%s is %s' % (res_name, status[res_name])) -def clean_up(services): - print('exiting:') +def stop_services(services): for systemd_unit in reversed(services): call_systemd('stop', systemd_unit) @@ -98,18 +97,17 @@ def get_systemd_units(systemd_units_str): return systemd_units -def process(events2, resources, services, status): +def process(events2, resources, running_services, status): line = events2.stdout.readline() m = MAY_PROMOT_RE.match(line) if m: res_name, may_promote, promotion_score = m.groups() if res_name in resources and may_promote == 'yes': - systemd_units_str = resources[res_name]['systemd-units'] - for systemd_unit in get_systemd_units(systemd_units_str): + for systemd_unit in resources[res_name]['systemd-units']: if not ensure_systemd_started(systemd_unit): break - if systemd_unit not in services: - services.append(systemd_unit) + if systemd_unit not in running_services: + running_services.append(systemd_unit) m = PEER_ROLE_RE.match(line) if m: res_name, conn_name, role = m.groups() @@ -119,15 +117,14 @@ def process(events2, resources, services, status): if m: res_name, have_quorum = m.groups() if res_name in resources and have_quorum == 'no': - systemd_units_str = resources[res_name]['systemd-units'] - systemd_units = get_systemd_units(systemd_units_str) - to_stop = [x for x in systemd_units if x in services] + systemd_units = resources[res_name]['systemd-units'] + to_stop = [x for x in systemd_units if x in running_services] if to_stop: print('Lost quorum on %s' % (res_name)) for systemd_unit in 
reversed(to_stop): r = call_systemd('stop', systemd_unit) if r: - services.remove(systemd_unit) + running_services.remove(systemd_unit) def active_drbd_volume(res_name): @@ -152,8 +149,7 @@ def active_drbd_volume(res_name): def main(): - services = [] - status = dict() + # 1. Load minidrbdcluster config. config = configparser.ConfigParser() config.read('/etc/minidrbdcluster.ini') resources = config._sections @@ -162,12 +158,28 @@ def main(): 'No resources to watch, maybe /etc/minidrbdcluster.ini missing' ) print('Managing DRBD resources: %s' % (' '.join(resources))) - for res_name in resources: + + # 2. Prepare resources. + status = dict() + all_services = [] # Contains common services between each DRBD volumes. + for res_name, resource in resources.iteritems(): status[res_name] = dict() active_drbd_volume(res_name) + systemd_units = get_systemd_units(resource['systemd-units']) + resource['systemd-units'] = systemd_units + + for systemd_unit in systemd_units: + if systemd_unit not in all_services: + all_services.append(systemd_unit) + # 3. Ensure all services are stopped. + stop_services(all_services) + + # 4. Run! 
signal.signal(signal.SIGHUP, sig_handler) + running_services = [] + print('Starting process...') events2 = subprocess.Popen( ['drbdsetup', 'events2'], stdout=subprocess.PIPE @@ -175,14 +187,16 @@ def main(): run = True while run: try: - process(events2, resources, services, status) + process(events2, resources, running_services, status) except KeyboardInterrupt: run = False except SigHupException: - show_status(services, status) - - clean_up(services) + show_status(running_services, status) + except Exception: + print('Unhandled exception: %s' % str(e)) + print('Exiting...') + stop_services(running_services) if __name__ == '__main__': main() From 65340daca39eba9616bf7f0ac248b2e96b77adf4 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 23 Sep 2022 17:45:08 +0200 Subject: [PATCH 075/133] feat(linstor-manager): add new `healthCheck` function to monitor pool (#26) Print a JSON output to monitor state of LINSTOR SRs: - Display nodes, storage pool and resources - Print human readable warnings and errors Usage example: ``` xe host-call-plugin host-uuid=c96ec4dd-28ac-4df4-b73c-4371bd202728 plugin=linstor-manager fn=healthCheck args:groupName=linstor_group { "errors": [], "warnings": [], "controller-uri": "linstor://172.16.210.14", "storage-pools": { "r620-s1": [ { "free-size": 999125155840, "storage-pool-name": "xcp-sr-linstor_group", "capacity": 1000203091968, "uuid": "994a5c45-ba52-4f17-8e46-74c7dec0a1e7" } ], "r620-s3": [ { "free-size": 999125155840, "storage-pool-name": "xcp-sr-linstor_group", "capacity": 1000203091968, "uuid": "ad78adad-a9f6-4513-9f96-e8eb8fe716dc" } ], "r620-s2": [ { "free-size": 999125155840, "storage-pool-name": "xcp-sr-linstor_group", "capacity": 1000203091968, "uuid": "f76048f9-8821-484b-9a51-670a49df7a6e" } ] }, "nodes": { "r620-s1": "ONLINE", "r620-s3": "ONLINE", "r620-s2": "ONLINE" }, "resources": { "xcp-persistent-database": { "r620-s1": { "tie-breaker": false, "in-use": true, "volumes": [ { "storage-pool-name": "xcp-sr-linstor_group", 
"uuid": "1a436f23-eb81-4a8f-8ab6-de317282b5d5", "device-path": "/dev/drbd1000", "number": 0, "disk-state": "UpToDate", "allocated-size": 1077936128, "usable-size": 1073741824 } ], "diskful": true }, "r620-s3": { "tie-breaker": false, "in-use": false, "volumes": [ { "storage-pool-name": "xcp-sr-linstor_group", "uuid": "31a05bd1-20b6-471a-86b9-bbcdccfaab96", "device-path": "/dev/drbd1000", "number": 0, "disk-state": "UpToDate", "allocated-size": 1077936128, "usable-size": 1073741824 } ], "diskful": true }, "r620-s2": { "tie-breaker": false, "in-use": false, "volumes": [ { "storage-pool-name": "xcp-sr-linstor_group", "uuid": "0420f252-9762-4063-bdd4-732e40373ffb", "device-path": "/dev/drbd1000", "number": 0, "disk-state": "UpToDate", "allocated-size": 1077936128, "usable-size": 1073741824 } ], "diskful": true } } } } ``` Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 116 +++++++++++++++++++++++++++++++- drivers/linstorvolumemanager.py | 105 +++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+), 1 deletion(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 5485b900..7abc1054 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -848,6 +848,119 @@ def get_drbd_openers(session, args): raise +def health_check(session, args): + group_name = args['groupName'] + + result = { + 'controller-uri': '', + 'nodes': {}, + 'storage-pools': {}, + 'warnings': [], + 'errors': [] + } + + def format_result(): + return json.dumps(result) + + # 1. Get controller. + try: + controller_uri = get_controller_uri() + + result['controller-uri'] = controller_uri + try: + if controller_uri == 'linstor://localhost': + # Replace `localhost` with IP to give a better info for users. + result['controller-uri'] = 'linstor://' + util.get_this_host_address(session) + except Exception: + # Ignore error: can be a XAPI restart or something else. 
+ pass + + linstor = LinstorVolumeManager( + controller_uri, + group_name, + logger=util.SMlog + ) + except Exception as e: + # Probably a network issue, or offline controller. + result['errors'].append('Cannot join SR: `{}`.'.format(e)) + return format_result() + + try: + # 2. Check node statuses. + nodes = linstor.get_nodes_info() + result['nodes'] = nodes + for node_name, status in nodes.items(): + if status != 'ONLINE': + result['warnings'].append('Node `{}` is {}.'.format(node_name, status)) + + # 3. Check storage pool statuses. + storage_pools_per_node = linstor.get_storage_pools_info() + result['storage-pools'] = storage_pools_per_node + for node_name, storage_pools in storage_pools_per_node.items(): + for storage_pool in storage_pools: + free_size = storage_pool['free-size'] + capacity = storage_pool['capacity'] + if free_size < 0 or capacity <= 0: + result['errors'].append( + 'Cannot get free size and/or capacity of storage pool `{}`.' + .format(storage_pool['uuid']) + ) + elif free_size > capacity: + result['errors'].append( + 'Free size of storage pool `{}` is greater than capacity.' + .format(storage_pool['uuid']) + ) + else: + remaining_percent = free_size / float(capacity) * 100.0 + threshold = 10.0 + if remaining_percent < threshold: + result['warnings'].append( + 'Remaining size of storage pool `{}` is below {}% of its capacity.' + .format(storage_pool['uuid'], threshold) + ) + + # 4. Check resource statuses. 
+ all_resources = linstor.get_resources_info() + result['resources'] = all_resources + + for resource_name, resource_by_node in all_resources.items(): + for node_name, resource in resource_by_node.items(): + for volume_index, volume in enumerate(resource['volumes']): + disk_state = volume['disk-state'] + if disk_state in ['UpToDate', 'Created', 'Attached']: + continue + if disk_state == 'DUnknown': + result['warnings'].append( + 'Unknown state for volume `{}` at index {} for resource `{}` on node `{}`' + .format(volume['device-path'], volume_index, resource_name, node_name) + ) + continue + if disk_state in ['Inconsistent', 'Failed', 'To: Creating', 'To: Attachable', 'To: Attaching']: + result['errors'].append( + 'Invalid state `{}` for volume `{}` at index {} for resource `{}` on node `{}`' + .format(disk_state, volume['device-path'], volume_index, resource_name, node_name) + ) + continue + if disk_state == 'Diskless': + if resource['diskful']: + result['errors'].append( + 'Unintentional diskless state detected for volume `{}` at index {} for resource `{}` on node `{}`' + .format(volume['device-path'], volume_index, resource_name, node_name) + ) + elif resource['tie-breaker']: + volume['disk-state'] = 'TieBreaker' + continue + result['warnings'].append( + 'Unhandled state `{}` for volume `{}` at index {} for resource `{}` on node `{}`' + .format(disk_state, volume['device-path'], volume_index, resource_name, node_name) + ) + + except Exception as e: + result['errors'].append('Unexpected error: `{}`'.format(e)) + + return format_result() + + if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, @@ -887,5 +1000,6 @@ if __name__ == '__main__': 'listDrbdVolumes': list_drbd_volumes, 'destroyDrbdVolume': destroy_drbd_volume, 'destroyDrbdVolumes': destroy_drbd_volumes, - 'getDrbdOpeners': get_drbd_openers + 'getDrbdOpeners': get_drbd_openers, + 'healthCheck': health_check }) diff --git a/drivers/linstorvolumemanager.py 
b/drivers/linstorvolumemanager.py index 3ee5d248..efe5d53b 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1402,6 +1402,111 @@ def destroy_node(self, node_name): 'Failed to destroy node `{}`: {}'.format(node_name, error_str) ) + def get_nodes_info(self): + """ + Get all nodes + statuses, used or not by the pool. + :rtype: dict(str, dict) + """ + try: + nodes = {} + for node in self._linstor.node_list_raise().nodes: + nodes[node.name] = node.connection_status + return nodes + except Exception as e: + raise LinstorVolumeManagerError( + 'Failed to get all nodes: `{}`'.format(e) + ) + + def get_storage_pools_info(self): + """ + Give all storage pools of current group name. + :rtype: dict(str, list) + """ + storage_pools = {} + for pool in self._get_storage_pools(force=True): + if pool.node_name not in storage_pools: + storage_pools[pool.node_name] = [] + + size = -1 + capacity = -1 + + space = pool.free_space + if space: + size = space.free_capacity + if size < 0: + size = -1 + else: + size *= 1024 + capacity = space.total_capacity + if capacity <= 0: + capacity = -1 + else: + capacity *= 1024 + + storage_pools[pool.node_name].append({ + 'storage-pool-name': pool.name, + 'uuid': pool.uuid, + 'free-size': size, + 'capacity': capacity + }) + + return storage_pools + + def get_resources_info(self): + """ + Give all resources of current group name. + :rtype: dict(str, list) + """ + resources = {} + resource_list = self._linstor.resource_list_raise() + for resource in resource_list.resources: + if resource.name not in resources: + resources[resource.name] = {} + + resources[resource.name][resource.node_name] = { + 'volumes': [], + 'diskful': linstor.consts.FLAG_DISKLESS not in resource.flags, + 'tie-breaker': linstor.consts.FLAG_TIE_BREAKER in resource.flags + } + + for volume in resource.volumes: + # We ignore diskless pools of the form "DfltDisklessStorPool". 
+ if volume.storage_pool_name != self._group_name: + continue + + usable_size = volume.usable_size + if usable_size < 0: + usable_size = -1 + else: + usable_size *= 1024 + + allocated_size = volume.allocated_size + if allocated_size < 0: + allocated_size = -1 + else: + allocated_size *= 1024 + + resources[resource.name][resource.node_name]['volumes'].append({ + 'storage-pool-name': volume.storage_pool_name, + 'uuid': volume.uuid, + 'number': volume.number, + 'device-path': volume.device_path, + 'usable-size': usable_size, + 'allocated-size': allocated_size + }) + + for resource_state in resource_list.resource_states: + resource = resources[resource_state.rsc_name][resource_state.node_name] + resource['in-use'] = resource_state.in_use + + volumes = resource['volumes'] + for volume_state in resource_state.volume_states: + volume = next((x for x in volumes if x['number'] == volume_state.number), None) + if volume: + volume['disk-state'] = volume_state.disk_state + + return resources + @classmethod def create_sr( cls, group_name, node_names, ips, redundancy, From 6a0e3066eebad4da6fb6217e33b188059ec23641 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 4 Oct 2022 11:01:33 +0200 Subject: [PATCH 076/133] fix(LinstorSR): fix xha conf parsing => return host ip, not the UUID Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 374d6cb9..d32771fe 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -277,7 +277,7 @@ def get_ips_from_xha_config_file(): return IPS_XHA_CACHE ips = dict() - host_ip = None + host_id = None try: # Ensure there is no dirty read problem. # For example if the HA is reloaded. 
@@ -287,7 +287,7 @@ def get_ips_from_xha_config_file(): period=1 ) except: - return (host_ip, ips) + return (None, ips) def parse_host_nodes(ips, node): current_id = None @@ -322,14 +322,14 @@ def parse_local_config(ips, node): if node.tag == 'common-config': parse_common_config(ips, node) elif node.tag == 'local-config': - host_ip = parse_local_config(ips, node) + host_id = parse_local_config(ips, node) else: continue - if ips and host_ip: + if ips and host_id: break - return (host_ip, ips) + return (host_id and ips.get(host_id), ips) def activate_lvm_group(group_name): From 067f07226a2dde8528f546c1edfdddb99e1e96e7 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 4 Oct 2022 18:48:09 +0200 Subject: [PATCH 077/133] fix(LinstorSR): start correctly HA servers (HTTP/NBD) after reboot Use a timeout call after a reboot to get a XAPI session because it can be impossible to initialize the connection at startup or if the current host has been ejected (in this case the call can block indefinitely). Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index d32771fe..ae253858 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -2513,7 +2513,10 @@ def _start_persistent_http_server(volume_name): port = '8077' try: - session = util.get_localAPI_session() + # Use a timeout call because XAPI may be unusable on startup + # or if the host has been ejected. So in this case the call can + # block indefinitely. + session = util.timeout_call(5, util.get_localAPI_session) host_ip = util.get_this_host_address(session) except: # Fallback using the XHA file if session not available. 
@@ -2583,7 +2586,7 @@ def _start_persistent_nbd_server(self, volume_name): port = '8077' try: - session = util.get_localAPI_session() + session = util.timeout_call(5, util.get_localAPI_session) ips = util.get_host_addresses(session) except Exception as e: _, ips = get_ips_from_xha_config_file() From 81956c30d5c3f69d6034ae81e0042367465f39c0 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 5 Oct 2022 10:45:50 +0200 Subject: [PATCH 078/133] fix(linstorvolumemanager): use an array to store diskful volumes info Otherwise the `is_diskful` attr only reflects the info of one host after a call to `get_volume_info`... And we therefore lose this information concerning the others. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 4 +++- drivers/linstorvolumemanager.py | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index ae253858..b8558077 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -39,6 +39,7 @@ import re import scsiutil import signal +import socket import SR import SRCommand import subprocess @@ -2729,7 +2730,8 @@ def _attach_using_http_nbd(self): .format(self.uuid, e) ) - must_get_device_path = volume_info.is_diskful + hostname = socket.gethostname() + must_get_device_path = hostname in volume_info.diskful drbd_path = None if must_get_device_path or self.sr._is_master: diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index efe5d53b..e577f63c 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -307,19 +307,19 @@ class VolumeInfo(object): 'allocated_size', # Allocated size, place count is not used. 'virtual_size', # Total virtual available size of this volume # (i.e. the user size at creation). - 'is_diskful' + 'diskful' # Array of nodes that have a diskful volume. 
) def __init__(self, name): self.name = name self.allocated_size = 0 self.virtual_size = 0 - self.is_diskful = False + self.diskful = [] def __repr__(self): return 'VolumeInfo("{}", {}, {}, {})'.format( self.name, self.allocated_size, self.virtual_size, - 'diskful' if self.is_diskful else 'diskless' + self.diskful ) # -------------------------------------------------------------------------- @@ -1878,7 +1878,8 @@ def _get_volumes_info(self, volume_name=None): else: current = all_volume_info[resource.name] - current.is_diskful = linstor.consts.FLAG_DISKLESS not in resource.flags + if linstor.consts.FLAG_DISKLESS not in resource.flags: + current.diskful.append(resource.node_name) for volume in resource.volumes: # We ignore diskless pools of the form "DfltDisklessStorPool". From e38f4dbd5529f8b8d55c8fdd289e60b57542f7b6 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 6 Oct 2022 17:54:10 +0200 Subject: [PATCH 079/133] feat(linstorvolumemanager): support snaps when a host is offline - Don't create diskless volumes during clone, delay it. Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index e577f63c..09aad42f 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1199,12 +1199,6 @@ def find_best_nodes(): rsc_name=clone_volume_name, storage_pool=self._group_name )) - for node_name in diskless_node_names: - resources.append(linstor.ResourceData( - node_name=node_name, - rsc_name=clone_volume_name, - diskless=True - )) # 5. Create resources! 
def clean(): From f04a4d7672c508c58ad47a948ac61f09c878fb7d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 7 Oct 2022 17:18:37 +0200 Subject: [PATCH 080/133] fix(linstorvolumemanager): support offline hosts when plugins are called - Robustify plugin calls - Add a timeout on session getter Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 69 +++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 09aad42f..58c02382 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -83,15 +83,30 @@ def get_all_volume_openers(resource_name, volume): volume = str(volume) openers = {} - session = util.get_localAPI_session() + # Make sure this call never stucks because this function can be called + # during HA init and in this case we can wait forever. + session = util.timeout_call(10, util.get_localAPI_session) + hosts = session.xenapi.host.get_all_records() for host_ref, host_record in hosts.items(): - openers[host_record['hostname']] = json.loads( - session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { - 'resourceName': resource_name, - 'volume': volume - }) - ) + node_name = host_record['hostname'] + try: + if not session.xenapi.host_metrics.get_record( + host_record['metrics'] + )['live']: + # Ensure we call plugin on online hosts only. + continue + + openers[node_name] = json.loads( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, { + 'resourceName': resource_name, + 'volume': volume + }) + ) + except Exception as e: + util.SMlog('Failed to get openers of `{}` on `{}`: {}'.format( + resource_name, node_name, e + )) return openers @@ -162,12 +177,20 @@ def _get_controller_uri(): # replicated volume. `drbdadm status xcp-persistent-database` returns # 3 connections by default. 
try: - session = util.get_localAPI_session() + session = util.timeout_call(10, util.get_localAPI_session) + for host_ref, host_record in session.xenapi.host.get_all_records().items(): - if distutils.util.strtobool( - session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) - ): - return 'linstor://' + host_record['address'] + node_name = host_record['hostname'] + try: + if distutils.util.strtobool( + session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) + ): + return 'linstor://' + host_record['address'] + except Exception as e: + # Can throw and exception if a host is offline. So catch it. + util.SMlog('Unable to search controller on `{}`: {}'.format( + node_name, e + )) except: # Not found, maybe we are trying to create the SR... pass @@ -200,12 +223,24 @@ def get_controller_node_name(): if res: return res.groups()[0] - session = util.get_localAPI_session() + session = util.timeout_call(5, util.get_localAPI_session) + for host_ref, host_record in session.xenapi.host.get_all_records().items(): - if distutils.util.strtobool( - session.xenapi.host.call_plugin(host_ref, PLUGIN, PLUGIN_CMD, {}) - ): - return host_record['hostname'] + node_name = host_record['hostname'] + try: + if not session.xenapi.host_metrics.get_record( + host_record['metrics'] + )['live']: + continue + + if distutils.util.strtobool(session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {} + )): + return node_name + except Exception as e: + util.SMlog('Failed to call plugin to get controller on `{}`: {}'.format( + node_name, e + )) # ============================================================================== From bfb97cb543f59316d02452506fb1daac60984b43 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 7 Oct 2022 17:45:26 +0200 Subject: [PATCH 081/133] fix(linstorvolumemanager): define _base_group_name member at SR creation Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 58c02382..d19effbe 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1622,6 +1622,7 @@ def _create_sr( ) driver_pool_name = group_name + base_group_name = group_name group_name = cls._build_group_name(group_name) pools = lin.storage_pool_list_raise(filter_by_stor_pools=[group_name]) pools = pools.storage_pools @@ -1784,6 +1785,7 @@ def _create_sr( instance._linstor = lin instance._logger = logger instance._redundancy = redundancy + instance._base_group_name = base_group_name instance._group_name = group_name instance._volumes = set() instance._storage_pools_time = 0 From 459bef9e56d2b2d4d6e2a9d4f4e19287f6b0ae07 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 10 Oct 2022 14:33:24 +0200 Subject: [PATCH 082/133] feat(linstorvhdutil): modify logic of local vhdutil calls - Always log openers when we can't call vhdutil locally - Remove "raise" lines after a local fail, always retry command calls on a remote Signed-off-by: Ronan Abhamon --- drivers/linstorvhdutil.py | 72 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index 4d031e12..2687cadf 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -65,12 +65,17 @@ def wrapper(*args, **kwargs): (node_names, in_use) = \ self._linstor.find_up_to_date_diskful_nodes(vdi_uuid) + local_e = None try: if not in_use or socket.gethostname() in node_names: - return self._call_local_vhd_util(local_method, device_path, *args[2:], **kwargs) - except util.CommandException as e: - if e.code != errno.EROFS and e.code != EMEDIUMTYPE: - raise + # Don't call `_call_local_vhd_util`, we don't want to + # trace failed calls now using opener files. It can be + # normal to have an exception. 
+ def local_call(): + return local_method(device_path, *args[2:], **kwargs) + return util.retry(local_call, 5, 2) + except util.CommandException as local_e: + self._handle_local_vhd_util_error(local_e) # B. Execute the plugin on master or slave. remote_args = { @@ -80,12 +85,13 @@ def wrapper(*args, **kwargs): remote_args.update(**kwargs) remote_args = {str(key): str(value) for key, value in remote_args.iteritems()} - def remote_call(): - host_ref = self._get_readonly_host( - vdi_uuid, device_path, node_names - ) - return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) - response = util.retry(remote_call, 5, 2) + try: + def remote_call(): + host_ref = self._get_readonly_host(vdi_uuid, device_path, node_names) + return call_vhd_util_on_host(self._session, host_ref, remote_method, device_path, remote_args) + response = util.retry(remote_call, 5, 2) + except Exception as remote_e: + self._raise_openers_exception(device_path, local_e or remote_e) return response_parser(self, vdi_uuid, response) return wrapper @@ -271,22 +277,13 @@ def _get_readonly_host(self, vdi_uuid, device_path, node_names): # -------------------------------------------------------------------------- - def _call_local_vhd_util(self, local_method, device_path, *args, **kwargs): - try: - def local_call(): - return local_method(device_path, *args, **kwargs) - return util.retry(local_call, 5, 2) - except util.CommandException as e: - if e.code != errno.EROFS and e.code != EMEDIUMTYPE: - raise - - # Volume is locked on a host, find openers. 
+ def _raise_openers_exception(self, device_path, e): e_with_openers = None try: volume_uuid = self._linstor.get_volume_uuid_from_device_path( device_path ) - e_with_openers = util.CommandException( + e_wrapper = util.CommandException( e.code, e.cmd, e.reason + ' (openers: {})'.format( @@ -294,12 +291,29 @@ def local_call(): ) ) except Exception as illformed_e: - raise util.CommandException( + e_wrapper = util.CommandException( e.code, e.cmd, e.reason + ' (unable to get openers: {})'.format(illformed_e) ) - raise e_with_openers # pylint: disable = E0702 + util.SMlog('raise opener exception: {} ({})'.format(e_wrapper, e_wrapper.reason)) + raise e_wrapper # pylint: disable = E0702 + + @staticmethod + def _handle_local_vhd_util_error(e): + if e.code != errno.EROFS and e.code != EMEDIUMTYPE: + util.SMlog('failed to execute locally vhd-util (sys {})'.format(e.code)) + + def _call_local_vhd_util(self, local_method, device_path, *args, **kwargs): + try: + def local_call(): + return local_method(device_path, *args, **kwargs) + return util.retry(local_call, 5, 2) + except util.CommandException as e: + self._handle_local_vhd_util_error(e) + + # Volume is locked on a host, find openers. + self._raise_openers_exception(device_path, e) def _call_vhd_util(self, local_method, remote_method, device_path, *args, **kwargs): # A. Try to write locally... @@ -308,8 +322,7 @@ def local_call(): return local_method(device_path, *args, **kwargs) return util.retry(local_call, 5, 2) except util.CommandException as e: - if e.code != errno.EROFS and e.code != EMEDIUMTYPE: - raise + self._handle_local_vhd_util_error(e) # B. Execute the command on another host. # B.1. Get host list. 
@@ -362,11 +375,14 @@ def remote_call(): pass if no_host_found: - return local_method(device_path, *args, **kwargs) + try: + return local_method(device_path, *args, **kwargs) + except Exception as e: + self._raise_openers_exception(device_path, e) raise xs_errors.XenError( 'VDIUnavailable', - opterr='No valid host found to run vhd-util command `{}` (path={}): {}' - .format(remote_method, device_path, e) + opterr='No valid host found to run vhd-util command `{}` (path=`{}`, openers=`{}`): {}' + .format(remote_method, device_path, openers, e) ) return util.retry(remote_call, 5, 2) From 94a1fa46d23b0b7b7989a25a850b8c93fe7082bf Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 17 Oct 2022 18:14:16 +0200 Subject: [PATCH 083/133] fix(linstorvolumemanager): robustify failed snapshots - Ensure we can always rename a failed snap, so we must check if we have metadata in the KV-store. Otherwise an error is triggered because we are trying to copy a None object. - If we can't delete a volume, rename it with a DELETED_ prefix, it's mandatory in specific cases to rollback snapshot transactions. Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index d19effbe..44b247e8 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -951,7 +951,11 @@ def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): volume_properties[self.PROP_UPDATING_UUID_SRC] = volume_uuid # 4. Copy the properties. - volume_properties[self.PROP_METADATA] = metadata + # Note: On new volumes, during clone for example, the metadata + # may be missing. So we must test it to avoid this error: + # "None has to be a str/unicode, but is " + if metadata: + volume_properties[self.PROP_METADATA] = metadata volume_properties[self.PROP_VOLUME_NAME] = volume_name # 5. Ok! 
@@ -2289,6 +2293,16 @@ def _build_volumes(self, repair): 'Cannot clean volume {}: {}'.format(volume_uuid, e) ) + # The volume can't be removed, maybe it's still in use, + # in this case rename it with the "DELETED_" prefix. + # This prefix is mandatory if it exists a snap transaction to + # rollback because the original VDI UUID can try to be renamed + # with the UUID we are trying to delete... + if not volume_uuid.startswith('DELETED_'): + self.update_volume_uuid( + volume_uuid, 'DELETED_' + volume_uuid, force=True + ) + for dest_uuid, src_uuid in updating_uuid_volumes.items(): dest_namespace = self._build_volume_namespace(dest_uuid) From b1f91641aa87732172b14c5f16bd290fa411060d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 8 Nov 2022 17:31:45 +0100 Subject: [PATCH 084/133] fix(linstorvolumemanager): use a namespace for volumes - This change is not compatible with existing LINSTOR SR instances! Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 44b247e8..8c253d49 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -308,7 +308,7 @@ class LinstorVolumeManager(object): # Property namespaces. NAMESPACE_SR = 'xcp/sr' - NAMESPACE_VOLUME = 'volume' + NAMESPACE_VOLUME = 'xcp/volume' # Regex to match properties. 
REG_PROP = '^([^/]+)/{}$' From 4d019915456c1cde66d4483cbfd5df71fb0608d7 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 14 Nov 2022 17:18:48 +0100 Subject: [PATCH 085/133] feat(linstor-kv-dump): rename to linstor-kv-tool + add remove volume helpers --- Makefile | 2 +- scripts/{linstor-kv-dump => linstor-kv-tool} | 42 +++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) rename scripts/{linstor-kv-dump => linstor-kv-tool} (51%) diff --git a/Makefile b/Makefile index d0f043f5..c38d5407 100755 --- a/Makefile +++ b/Makefile @@ -239,7 +239,7 @@ install: precheck install -m 755 drivers/fcoelib.py $(SM_STAGING)$(SM_DEST) mkdir -p $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/fork-log-daemon $(SM_STAGING)$(LIBEXEC) - install -m 755 scripts/linstor-kv-dump $(SM_STAGING)$(BIN_DEST) + install -m 755 scripts/linstor-kv-tool $(SM_STAGING)$(BIN_DEST) install -m 755 scripts/local-device-change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/check-device-sharing $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/usb_change $(SM_STAGING)$(LIBEXEC) diff --git a/scripts/linstor-kv-dump b/scripts/linstor-kv-tool similarity index 51% rename from scripts/linstor-kv-dump rename to scripts/linstor-kv-tool index 93598d7c..128d8992 100755 --- a/scripts/linstor-kv-dump +++ b/scripts/linstor-kv-tool @@ -18,6 +18,7 @@ import argparse import json import linstor + def dump_kv(controller_uri, group_name, namespace): kv = linstor.KV( group_name, @@ -26,13 +27,52 @@ def dump_kv(controller_uri, group_name, namespace): ) print(json.dumps(kv, sort_keys=True, indent=2)) + +def remove_volume(controller_uri, group_name, vdi_name): + assert vdi_name + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace='/xcp/volume/{}'.format(vdi_name) + ) + + for key, value in list(kv.items()): + del kv[key] + + +def remove_all_volumes(controller_uri, group_name): + kv = linstor.KV( + group_name, + uri=controller_uri, + namespace='/' + ) + + for key, value in list(kv.items()): + if 
key.startswith('xcp/volume/'): + size = key.rindex('/') + kv.namespace = key[:size] + del kv[key[size + 1:]] + + def main(): parser = argparse.ArgumentParser() parser.add_argument('-u', '--uri', required=True) parser.add_argument('-g', '--group-name', required=True) parser.add_argument('-n', '--namespace', default='/') + + action = parser.add_mutually_exclusive_group(required=True) + action.add_argument('--dump-volumes', action='store_true') + action.add_argument('--remove-volume', metavar='VDI_UUID') + action.add_argument('--remove-all-volumes', action='store_true') + args = parser.parse_args() - dump_kv(args.uri, args.group_name, args.namespace) + if args.dump_volumes: + dump_kv(args.uri, args.group_name, args.namespace) + elif args.remove_volume: + remove_volume(args.uri, args.group_name, args.remove_volume) + elif args.remove_all_volumes: + remove_all_volumes(args.uri, args.group_name) + if __name__ == '__main__': main() From d1314a71ed766c5fcc0ab91dd9570e66b8b22f07 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 16 Nov 2022 12:12:12 +0100 Subject: [PATCH 086/133] fix(LinstorSR): handle correctly localhost during start/stop of minidrbdcluster Otherwise another controller can be started during `xe sr-destroy` call. And in this case we can get an exception during the umount/remount of /var/lib/linstor to destroy the persistent database because a controller uses it. 
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index b8558077..ba284524 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -967,25 +967,48 @@ def _update_minidrbdcluster(self, host, enabled): def _update_minidrbdcluster_on_all_hosts( self, enabled, controller_node_name=None ): + if controller_node_name == 'localhost': + controller_node_name = self.session.xenapi.host.get_record( + util.get_this_host_ref(self.session) + )['hostname'] + assert controller_node_name + assert controller_node_name != 'localhost' + controller_host = None secondary_hosts = [] hosts = self.session.xenapi.host.get_all_records() for host_ref, host_rec in hosts.iteritems(): - if controller_node_name == host_rec['hostname']: + hostname = host_rec['hostname'] + if controller_node_name == hostname: controller_host = host_ref else: - secondary_hosts.append(host_ref) + secondary_hosts.append((host_ref, hostname)) + + action_name = 'Starting' if enabled else 'Stopping' + if controller_node_name and not controller_host: + util.SMlog('Failed to find controller host: `{}`'.format( + controller_node_name + )) if enabled and controller_host: + util.SMlog('{} minidrbdcluster on controller host `{}`...'.format( + action_name, controller_node_name + )) # If enabled is true, we try to start the controller on the desired # node name first. 
self._update_minidrbdcluster(controller_host, enabled) - for host in secondary_hosts: - self._update_minidrbdcluster(host, enabled) + for host_ref, hostname in secondary_hosts: + util.SMlog('{} minidrbdcluster on host {}...'.format( + action_name, hostname + )) + self._update_minidrbdcluster(host_ref, enabled) if not enabled and controller_host: + util.SMlog('{} minidrbdcluster on controller host `{}`...'.format( + action_name, controller_node_name + )) # If enabled is false, we disable the minidrbdcluster service of # the controller host last. Why? Otherwise the linstor-controller # of other nodes can be started, and we don't want that. From 7aa139ccb966c84bff6321403277d700424361a6 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 17 Nov 2022 15:43:25 +0100 Subject: [PATCH 087/133] fix(cleanup.py): call repair on another host when EROFS is returned (DRBD) Signed-off-by: Ronan Abhamon --- drivers/cleanup.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 9d55baac..bf0cc46e 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -726,6 +726,12 @@ def delete(self): lock.Lock.cleanupAll(self.uuid) self._clear() + def getParent(self): + return vhdutil.getParent(self.path, lambda x: x.strip()) + + def repair(self, parent): + vhdutil.repair(parent) + def __str__(self): strHidden = "" if self.hidden: @@ -876,11 +882,11 @@ def _coalesceVHD(self, timeOut): # Try a repair and reraise the exception parent = "" try: - parent = vhdutil.getParent(self.path, lambda x: x.strip()) + parent = self.getParent() # Repair error is logged and ignored. 
Error reraised later util.SMlog('Coalesce failed on %s, attempting repair on ' \ 'parent %s' % (self.uuid, parent)) - vhdutil.repair(parent) + self.repair(parent) except Exception, e: util.SMlog('(error ignored) Failed to repair parent %s ' \ 'after failed coalesce on %s, err: %s' % @@ -1442,6 +1448,16 @@ def pause(self, failfast=False): def coalesce(self): self.sr._vhdutil.force_coalesce(self.path) + def getParent(self): + return self.sr._vhdutil.get_parent( + self.sr._linstor.get_volume_uuid_from_device_path(self.path) + ) + + def repair(self, parent_uuid): + self.sr._vhdutil.force_repair( + self.sr._linstor.get_device_path(parent_uuid) + ) + def _relinkSkip(self): abortFlag = IPCFlag(self.sr.uuid) for child in self.children: From 7ed64317c3e76e77e92ad599210df984e9f3a6df Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 17 Nov 2022 15:46:02 +0100 Subject: [PATCH 088/133] fix(LinstorSR): avoid introduction of DELETED volumes Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index ba284524..9e5b3cda 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1170,6 +1170,9 @@ def _load_vdis_ex(self): ) continue + if vdi_uuid.startswith('DELETED_'): + continue + util.SMlog( 'Trying to introduce VDI {} as it is present in ' 'LINSTOR and not in XAPI...' From 54b630ee3360094d704401e948e836e9832194a1 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 18 Nov 2022 10:40:58 +0100 Subject: [PATCH 089/133] feat(linstor-kv-tool): remove-all-volumes supports journals now Not yet supported for remove-volume, not sure about the consequences for the moment. 
Signed-off-by: Ronan Abhamon --- scripts/linstor-kv-tool | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/linstor-kv-tool b/scripts/linstor-kv-tool index 128d8992..c9070270 100755 --- a/scripts/linstor-kv-tool +++ b/scripts/linstor-kv-tool @@ -48,7 +48,7 @@ def remove_all_volumes(controller_uri, group_name): ) for key, value in list(kv.items()): - if key.startswith('xcp/volume/'): + if key.startswith('xcp/volume/') or key.startswith('xcp/sr/journal/'): size = key.rindex('/') kv.namespace = key[:size] del kv[key[size + 1:]] From ce85d028941ff3447790a9925847c5170fe911fd Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 23 Nov 2022 15:26:51 +0100 Subject: [PATCH 090/133] fix(linstorvhdutil): due to bad refactoring, check call was broken Signed-off-by: Ronan Abhamon --- drivers/linstorvhdutil.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index 2687cadf..a883ca4d 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -124,11 +124,11 @@ def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): 'ignoreMissingFooter': str(ignore_missing_footer), 'fast': str(fast) } - return self._check(vdi_uuid, **kwargs) + return self._check(vdi_uuid, **kwargs) # pylint: disable = E1123 @linstorhostcall(vhdutil.check, 'check') - def _check(self, vdi_uuid, **kwargs): - return distutils.util.strtobool(kwargs['response']) + def _check(self, vdi_uuid, response): + return distutils.util.strtobool(response) def get_vhd_info(self, vdi_uuid, include_parent=True): kwargs = {'includeParent': str(include_parent)} From e29163c71c783beaa929928385674186e9d58f50 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 23 Nov 2022 15:28:23 +0100 Subject: [PATCH 091/133] feat(linstorvhdutil): ensure we use VHD parent to find host where to coalesce Signed-off-by: Ronan Abhamon --- drivers/linstorvhdutil.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 
deletions(-) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index a883ca4d..c2e9665f 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -228,15 +228,15 @@ def force_parent(self, path, parentPath, parentRaw=False): 'parentPath': str(parentPath), 'parentRaw': parentRaw } - return self._call_vhd_util(vhdutil.setParent, 'setParent', path, **kwargs) + return self._call_vhd_util(vhdutil.setParent, 'setParent', path, use_parent=False, **kwargs) @linstormodifier() def force_coalesce(self, path): - return self._call_vhd_util(vhdutil.coalesce, 'coalesce', path) + return self._call_vhd_util(vhdutil.coalesce, 'coalesce', path, use_parent=True) @linstormodifier() def force_repair(self, path): - return self._call_vhd_util(vhdutil.repair, 'repair', path) + return self._call_vhd_util(vhdutil.repair, 'repair', path, use_parent=False) # -------------------------------------------------------------------------- # Helpers. @@ -315,7 +315,12 @@ def local_call(): # Volume is locked on a host, find openers. self._raise_openers_exception(device_path, e) - def _call_vhd_util(self, local_method, remote_method, device_path, *args, **kwargs): + def _call_vhd_util(self, local_method, remote_method, device_path, use_parent, *args, **kwargs): + # Note: `use_parent` exists to know if the VHD parent is used by the local/remote method. + # Normally in case of failure, if the parent is unused we try to execute the method on + # another host using the DRBD opener list. In the other case, if the parent is required, + # we must check where this last one is open instead of the child. + # A. Try to write locally... try: def local_call(): @@ -346,11 +351,16 @@ def local_call(): volume_uuid = self._linstor.get_volume_uuid_from_device_path( device_path ) + parent_volume_uuid = None + if use_parent: + parent_volume_uuid = self.get_parent(volume_uuid) + + openers_uuid = parent_volume_uuid if use_parent else volume_uuid # B.3. Call! 
def remote_call(): try: - all_openers = self._linstor.get_volume_openers(volume_uuid) + all_openers = self._linstor.get_volume_openers(openers_uuid) except Exception as e: raise xs_errors.XenError( 'VDIUnavailable', From 4727679136ed1c2b6f825f7c3246f72d86fbaae6 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 5 Dec 2022 18:40:11 +0100 Subject: [PATCH 092/133] feat(linstorvolumemanager): force DRBD demote after failed volume creation/clone Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 13 ++++ drivers/linstorvolumemanager.py | 101 ++++++++++++++++++++++++-------- 2 files changed, 91 insertions(+), 23 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 7abc1054..5c4c5c90 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -804,6 +804,18 @@ def create_sr(session, args): raise +def demote_drbd_resource(session, args): + try: + resource_name = args['resource_name'] + (ret, stdout, stderr) = util.doexec(['drbdsetup', 'secondary', resource_name]) + if ret: + raise Exception('Failed to demote resource: {}'.format(stderr)) + return str(True) + except Exception as e: + util.SMlog('linstor-manager:demote_drbd_resource error: {}'.format(e)) + return str(False) + + def list_drbd_volumes(session, args): try: volume_group = args.get('volume_group') @@ -998,6 +1010,7 @@ if __name__ == '__main__': 'removeHost': remove_host, 'createSr': create_sr, 'listDrbdVolumes': list_drbd_volumes, + 'demoteDrbdResource': demote_drbd_resource, 'destroyDrbdVolume': destroy_drbd_volume, 'destroyDrbdVolumes': destroy_drbd_volumes, 'getDrbdOpeners': get_drbd_openers, diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 8c253d49..2e2feb23 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -242,6 +242,29 @@ def get_controller_node_name(): node_name, e )) + +def demote_drbd_resource(node_name, resource_name): + PLUGIN_CMD = 'demoteDrbdResource' + + session = 
util.timeout_call(5, util.get_localAPI_session) + + for host_ref, host_record in session.xenapi.host.get_all_records().items(): + if host_record['hostname'] != node_name: + continue + + try: + session.xenapi.host.call_plugin( + host_ref, PLUGIN, PLUGIN_CMD, {'resource_name': resource_name} + ) + except Exception as e: + util.SMlog('Failed to demote resource `{}` on `{}`: {}'.format( + resource_name, node_name, e + )) + raise Exception( + 'Can\'t demote resource `{}`, unable to find node `{}`' + .format(resource_name, node_name) + ) + # ============================================================================== class LinstorVolumeManagerError(Exception): @@ -615,6 +638,7 @@ def create_volume( no_diskless=no_diskless ) + # Volume created! Now try to find the device path. try: self._logger( 'Find device path of LINSTOR volume {}...'.format(volume_uuid) @@ -627,8 +651,10 @@ def create_volume( 'LINSTOR volume {} created!'.format(volume_uuid) ) return device_path - except Exception: - self._force_destroy_volume(volume_uuid) + except Exception as e: + # There is an issue to find the path. + # At this point the volume has just been created, so force flag can be used. + self._destroy_volume(volume_uuid, force=True) raise def mark_volume_as_persistent(self, volume_uuid): @@ -1242,7 +1268,7 @@ def find_best_nodes(): # 5. Create resources! def clean(): try: - self._destroy_volume(clone_uuid) + self._destroy_volume(clone_uuid, force=True) except Exception as e: self._logger( 'Unable to destroy volume {} after shallow clone fail: {}' @@ -1250,12 +1276,16 @@ def clean(): ) def create(): - try: - volume_properties = self._create_volume_with_properties( - clone_uuid, clone_volume_name, size, - place_resources=False - ) + # Note: placed outside try/except block because we create only definition first. + # There is no reason to call `clean` before the real resource creation. 
+ volume_properties = self._create_volume_with_properties( + clone_uuid, clone_volume_name, size, + place_resources=False + ) + # After this point, `clean` can be called for any fail because the clone UUID + # is really unique. No risk to remove existing data. + try: result = self._linstor.resource_create(resources) error_str = self._get_error_str(result) if error_str: @@ -1298,6 +1328,7 @@ def remove_resourceless_volumes(self): resource_names = self._fetch_resource_names() for volume_uuid, volume_name in self.get_volumes_with_name().items(): if not volume_name or volume_name not in resource_names: + # Don't force, we can be sure of what's happening. self.destroy_volume(volume_uuid) def destroy(self): @@ -2064,7 +2095,7 @@ def create_definition(): # B.2. Create volume! def clean(): try: - self._destroy_volume(volume_uuid) + self._destroy_volume(volume_uuid, force=True) except Exception as e: self._logger( 'Unable to destroy volume {} after creation fail: {}' @@ -2136,7 +2167,7 @@ def _create_volume_with_properties( # It can only happen if the same volume uuid is used in the same # call in another host. if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: - self._force_destroy_volume(volume_uuid) + self._destroy_volume(volume_uuid, force=True) raise def _find_device_path(self, volume_uuid, volume_name): @@ -2184,22 +2215,52 @@ def _request_device_path(self, volume_uuid, volume_name, activate=False): # Contains a path of the /dev/drbd form. 
return resources[0].volumes[0].device_path - def _destroy_resource(self, resource_name): - self._mark_resource_cache_as_dirty() + def _destroy_resource(self, resource_name, force=False): result = self._linstor.resource_dfn_delete(resource_name) error_str = self._get_error_str(result) - if error_str: + if not error_str: + self._mark_resource_cache_as_dirty() + return + + if not force: + self._mark_resource_cache_as_dirty() raise LinstorVolumeManagerError( - 'Could not destroy resource `{}` from SR `{}`: {}' + 'Could not destroy resource `{}` from SR `{}`: {}' .format(resource_name, self._group_name, error_str) ) - def _destroy_volume(self, volume_uuid): + # If force is used, ensure there is no opener. + all_openers = get_all_volume_openers(resource_name, '0') + for openers in all_openers.itervalues(): + if openers: + self._mark_resource_cache_as_dirty() + raise LinstorVolumeManagerError( + 'Could not force destroy resource `{}` from SR `{}`: {} (openers=`{}`)' + .format(resource_name, self._group_name, error_str, all_openers) + ) + + # Maybe the resource is blocked in primary mode. DRBD/LINSTOR issue? + resource_states = filter( + lambda resource_state: resource_state.name == resource_name, + self._get_resource_cache().resource_states + ) + + # Mark only after computation of states. + self._mark_resource_cache_as_dirty() + + for resource_state in resource_states: + volume_state = resource_state.volume_states[0] + if resource_state.in_use: + demote_drbd_resource(resource_state.node_name, resource_name) + break + self._destroy_resource(resource_name) + + def _destroy_volume(self, volume_uuid, force=False): volume_properties = self._get_volume_properties(volume_uuid) try: volume_name = volume_properties.get(self.PROP_VOLUME_NAME) if volume_name in self._fetch_resource_names(): - self._destroy_resource(volume_name) + self._destroy_resource(volume_name, force) # Assume this call is atomic. 
volume_properties.clear() @@ -2208,12 +2269,6 @@ def _destroy_volume(self, volume_uuid): 'Cannot destroy volume `{}`: {}'.format(volume_uuid, e) ) - def _force_destroy_volume(self, volume_uuid): - try: - self._destroy_volume(volume_uuid) - except Exception as e: - self._logger('Ignore fail: {}'.format(e)) - def _build_volumes(self, repair): properties = self._kv_cache resource_names = self._fetch_resource_names() @@ -2283,7 +2338,7 @@ def _build_volumes(self, repair): # Little optimization, don't call `self._destroy_volume`, # we already have resource name list. if volume_name in resource_names: - self._destroy_resource(volume_name) + self._destroy_resource(volume_name, force=True) # Assume this call is atomic. properties.clear() From e85e9b90c8d3fd616fe92ad4d980ec7698a4be10 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 6 Dec 2022 11:22:15 +0100 Subject: [PATCH 093/133] fix(linstorvhdutil): ensure we retry creation in all situations Without this patch, a basic resource creation is never restarted after a failure. The classic situation is when a DRBD error is raised like `Failed to adjust DRBD resource`. This problem is rare but it exists. 
Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 74 ++++++++++++++++----------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 2e2feb23..81cce802 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -2044,9 +2044,25 @@ def _create_volume( self, volume_uuid, volume_name, size, place_resources, no_diskless=False ): + if no_diskless and not place_resources: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`: it\'s impossible ' + .format(volume_uuid, self._group_name) + + 'to force no diskless without placing resources' + ) + size = self.round_up_volume_size(size) self._mark_resource_cache_as_dirty() + resources = [] + if no_diskless: + for node_name in self._get_node_names(): + resources.append(linstor.ResourceData( + node_name=node_name, + rsc_name=volume_name, + storage_pool=self._group_name + )) + def create_definition(): self._check_volume_creation_errors( self._linstor.resource_group_spawn( @@ -2060,39 +2076,6 @@ def create_definition(): ) self._increase_volume_peer_slots(self._linstor, volume_name) - # A. Basic case when we use the default redundancy of the group. - if not no_diskless: - create_definition() - if place_resources: - self._check_volume_creation_errors( - self._linstor.resource_auto_place( - rsc_name=volume_name, - place_count=self._redundancy, - diskless_on_remaining=not no_diskless - ), - volume_uuid, - self._group_name - ) - return - - # B. Complex case. - if not place_resources: - raise LinstorVolumeManagerError( - 'Could not create volume `{}` from SR `{}`: it\'s impossible ' - .format(volume_uuid, self._group_name) + - 'to force no diskless without placing resources' - ) - - # B.1. Create resource list. 
- resources = [] - for node_name in self._get_node_names(): - resources.append(linstor.ResourceData( - node_name=node_name, - rsc_name=volume_name, - storage_pool=self._group_name - )) - - # B.2. Create volume! def clean(): try: self._destroy_volume(volume_uuid, force=True) @@ -2105,13 +2088,26 @@ def clean(): def create(): try: create_definition() - result = self._linstor.resource_create(resources) - error_str = self._get_error_str(result) - if error_str: - raise LinstorVolumeManagerError( - 'Could not create volume `{}` from SR `{}`: {}'.format( - volume_uuid, self._group_name, error_str + if no_diskless: + # Create a physical resource on each node. + result = self._linstor.resource_create(resources) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not create volume `{}` from SR `{}`: {}'.format( + volume_uuid, self._group_name, error_str + ) ) + elif place_resources: + # Basic case when we use the default redundancy of the group. + self._check_volume_creation_errors( + self._linstor.resource_auto_place( + rsc_name=volume_name, + place_count=self._redundancy, + diskless_on_remaining=not no_diskless + ), + volume_uuid, + self._group_name ) except LinstorVolumeManagerError as e: if e.code != LinstorVolumeManagerError.ERR_VOLUME_EXISTS: From cb5abc13889eff0c125d9a64d1d523b03b34c3e0 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 7 Dec 2022 17:56:39 +0100 Subject: [PATCH 094/133] fix(linstorvhdutil): don't retry local vhdutil call when EROFS is detected Signed-off-by: Ronan Abhamon --- drivers/linstorvhdutil.py | 79 ++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index c2e9665f..63d59ab5 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -47,6 +47,14 @@ def call_vhd_util_on_host(session, host_ref, method, device_path, args): return response +class 
ErofsLinstorCallException(Exception): + def __init__(self, cmd_err): + self.cmd_err = cmd_err + + def __str__(self): + return str(self.cmd_err) + + def linstorhostcall(local_method, remote_method): def decorated(response_parser): def wrapper(*args, **kwargs): @@ -68,14 +76,17 @@ def wrapper(*args, **kwargs): local_e = None try: if not in_use or socket.gethostname() in node_names: - # Don't call `_call_local_vhd_util`, we don't want to - # trace failed calls now using opener files. It can be - # normal to have an exception. - def local_call(): - return local_method(device_path, *args[2:], **kwargs) - return util.retry(local_call, 5, 2) - except util.CommandException as local_e: - self._handle_local_vhd_util_error(local_e) + return self._call_local_vhd_util(local_method, device_path, *args[2:], **kwargs) + except ErofsLinstorCallException as e: + local_e = e.cmd_err + except Exception as e: + local_e = e + + util.SMlog( + 'unable to execute `{}` locally, retry using a readable host... (cause: {})'.format( + remote_method, local_e if local_e else 'local diskless + in use or not up to date' + ) + ) # B. Execute the plugin on master or slave. 
remote_args = { @@ -188,35 +199,35 @@ def get_block_bitmap(self, vdi_uuid, response): @linstormodifier() def create(self, path, size, static, msize=0): - return self._call_local_vhd_util(vhdutil.create, path, size, static, msize) + return self._call_local_vhd_util_or_fail(vhdutil.create, path, size, static, msize) @linstormodifier() def set_size_virt_fast(self, path, size): - return self._call_local_vhd_util(vhdutil.setSizeVirtFast, path, size) + return self._call_local_vhd_util_or_fail(vhdutil.setSizeVirtFast, path, size) @linstormodifier() def set_size_phys(self, path, size, debug=True): - return self._call_local_vhd_util(vhdutil.setSizePhys, path, size, debug) + return self._call_local_vhd_util_or_fail(vhdutil.setSizePhys, path, size, debug) @linstormodifier() def set_parent(self, path, parentPath, parentRaw=False): - return self._call_local_vhd_util(vhdutil.setParent, path, parentPath, parentRaw) + return self._call_local_vhd_util_or_fail(vhdutil.setParent, path, parentPath, parentRaw) @linstormodifier() def set_hidden(self, path, hidden=True): - return self._call_local_vhd_util(vhdutil.setHidden, path, hidden) + return self._call_local_vhd_util_or_fail(vhdutil.setHidden, path, hidden) @linstormodifier() def set_key(self, path, key_hash): - return self._call_local_vhd_util(vhdutil.setKey, path, key_hash) + return self._call_local_vhd_util_or_fail(vhdutil.setKey, path, key_hash) @linstormodifier() def kill_data(self, path): - return self._call_local_vhd_util(vhdutil.killData, path) + return self._call_local_vhd_util_or_fail(vhdutil.killData, path) @linstormodifier() def snapshot(self, path, parent, parentRaw, msize=0, checkEmpty=True): - return self._call_local_vhd_util(vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) + return self._call_local_vhd_util_or_fail(vhdutil.snapshot, path, parent, parentRaw, msize, checkEmpty) # -------------------------------------------------------------------------- # Remote setters: write locally and try on another 
host in case of failure. @@ -299,21 +310,27 @@ def _raise_openers_exception(self, device_path, e): util.SMlog('raise opener exception: {} ({})'.format(e_wrapper, e_wrapper.reason)) raise e_wrapper # pylint: disable = E0702 - @staticmethod - def _handle_local_vhd_util_error(e): - if e.code != errno.EROFS and e.code != EMEDIUMTYPE: - util.SMlog('failed to execute locally vhd-util (sys {})'.format(e.code)) - def _call_local_vhd_util(self, local_method, device_path, *args, **kwargs): try: def local_call(): - return local_method(device_path, *args, **kwargs) - return util.retry(local_call, 5, 2) + try: + return local_method(device_path, *args, **kwargs) + except util.CommandException as e: + if e.code == errno.EROFS or e.code == EMEDIUMTYPE: + raise ErofsLinstorCallException(e) # Break retry calls. + raise e + # Retry only locally if it's not an EROFS exception. + return util.retry(local_call, 5, 2, exceptions=[util.CommandException]) except util.CommandException as e: - self._handle_local_vhd_util_error(e) + util.SMlog('failed to execute locally vhd-util (sys {})'.format(e.code)) + raise e - # Volume is locked on a host, find openers. - self._raise_openers_exception(device_path, e) + def _call_local_vhd_util_or_fail(self, local_method, device_path, *args, **kwargs): + try: + return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) + except ErofsLinstorCallException as e: + # Volume is locked on a host, find openers. + self._raise_openers_exception(device_path, e) def _call_vhd_util(self, local_method, remote_method, device_path, use_parent, *args, **kwargs): # Note: `use_parent` exists to know if the VHD parent is used by the local/remote method. @@ -323,11 +340,11 @@ def _call_vhd_util(self, local_method, remote_method, device_path, use_parent, * # A. Try to write locally... 
try: - def local_call(): - return local_method(device_path, *args, **kwargs) - return util.retry(local_call, 5, 2) - except util.CommandException as e: - self._handle_local_vhd_util_error(e) + return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) + except Exception: + pass + + util.SMlog('unable to execute `{}` locally, retry using a writable host...'.format(remote_method)) # B. Execute the command on another host. # B.1. Get host list. From 72569cc956994979938d3d5b5a65010023c01068 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 15 Dec 2022 14:36:04 +0100 Subject: [PATCH 095/133] feat(fork-log-daemon): ignore SIGTERM Without this patch, the output logs of the fork-log-daemon child are never displayed when SIGTERM is sent to the PGID. Signed-off-by: Ronan Abhamon --- scripts/fork-log-daemon | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/fork-log-daemon b/scripts/fork-log-daemon index eb0f0b0f..665a60ba 100755 --- a/scripts/fork-log-daemon +++ b/scripts/fork-log-daemon @@ -1,12 +1,14 @@ #!/usr/bin/env python import select +import signal import subprocess import sys import syslog def main(): process = subprocess.Popen(sys.argv[1:], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + signal.signal(signal.SIGTERM, signal.SIG_IGN) write_to_stdout = True while process.poll() is None: From 07c3fc446098ed7b4a55b5d183d56896881d361c Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 16 Dec 2022 16:52:50 +0100 Subject: [PATCH 096/133] feat(LinstorSR): wait for http-disk-server startup Avoid a race condition with NBD server. We must be sure the HTTP server is reachable before the NBD server execution, otherwise the HA activation may fail. 
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 9e5b3cda..f336534e 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -2527,13 +2527,10 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): @staticmethod def _start_persistent_http_server(volume_name): - null = None pid_path = None http_server = None try: - null = open(os.devnull, 'w') - if volume_name == 'xcp-persistent-ha-statefile': port = '8076' else: @@ -2566,8 +2563,8 @@ def _start_persistent_http_server(volume_name): util.SMlog('Starting {} on port {}...'.format(arguments[0], port)) http_server = subprocess.Popen( [FORK_LOG_DAEMON] + arguments, - stdout=null, - stderr=null, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, # Ensure we use another group id to kill this process without # touch the current one. preexec_fn=os.setsid @@ -2576,6 +2573,17 @@ def _start_persistent_http_server(volume_name): pid_path = '/run/http-server-{}.pid'.format(volume_name) with open(pid_path, 'w') as pid_file: pid_file.write(str(http_server.pid)) + + def is_ready(): + while http_server.poll() is None: + if http_server.stdout.readline().rstrip() == 'Server ready!': + return True + return False + try: + if not util.timeout_call(10, is_ready): + raise Exception('Failed to wait HTTP server startup, bad output') + except util.TimeoutException: + raise Exception('Failed to wait for HTTP server startup during given delay') except Exception as e: if pid_path: try: @@ -2594,19 +2602,13 @@ def _start_persistent_http_server(volume_name): 'VDIUnavailable', opterr='Failed to start http-server: {}'.format(e) ) - finally: - if null: - null.close() def _start_persistent_nbd_server(self, volume_name): - null = None pid_path = None nbd_path = None nbd_server = None try: - null = open(os.devnull, 'w') - if volume_name == 'xcp-persistent-ha-statefile': port 
= '8076' else: @@ -2643,6 +2645,10 @@ def _start_persistent_nbd_server(self, volume_name): preexec_fn=os.setsid ) + pid_path = '/run/nbd-server-{}.pid'.format(volume_name) + with open(pid_path, 'w') as pid_file: + pid_file.write(str(nbd_server.pid)) + reg_nbd_path = re.compile("^NBD `(/dev/nbd[0-9]+)` is now attached.$") def get_nbd_path(): while nbd_server.poll() is None: @@ -2658,10 +2664,6 @@ def get_nbd_path(): except util.TimeoutException: raise Exception('Unable to read NBD path') - pid_path = '/run/nbd-server-{}.pid'.format(volume_name) - with open(pid_path, 'w') as pid_file: - pid_file.write(str(nbd_server.pid)) - util.SMlog('Create symlink: {} -> {}'.format(self.path, nbd_path)) os.symlink(nbd_path, self.path) except Exception as e: @@ -2688,9 +2690,6 @@ def get_nbd_path(): 'VDIUnavailable', opterr='Failed to start nbd-server: {}'.format(e) ) - finally: - if null: - null.close() @classmethod def _kill_persistent_server(self, type, volume_name, sig): From 6556e572a997679c9e0ab883932f98f5acaa176e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 16 Jan 2023 17:58:51 +0100 Subject: [PATCH 097/133] fix(LinstorSR): handle inflate + resize actions correctly - Ensure LINSTOR set the expected new volume size when inflate is executed, otherwise we log and we use the returned size. - Repair deflate calls in case of journal entry, cache is not usable. - Logs VHD and DRBD volume size in create/resize methods. - Ensure resize is only executed on master. - Use utilisation size instead of capacity in journal entries. 
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 49 +++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index f336534e..72ec9de7 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -234,7 +234,7 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): return util.SMlog( - 'Inflate {} (new VHD size={}, previous={})' + 'Inflate {} (size={}, previous={})' .format(vdi_uuid, new_size, old_size) ) @@ -243,8 +243,15 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): ) linstor.resize_volume(vdi_uuid, new_size) + result_size = linstor.get_volume_size(vdi_uuid) + if result_size < new_size: + util.SMlog( + 'WARNING: Cannot inflate volume to {}B, result size: {}B' + .format(new_size, result_size) + ) + if not util.zeroOut( - vdi_path, new_size - vhdutil.VHD_FOOTER_SIZE, + vdi_path, result_size - vhdutil.VHD_FOOTER_SIZE, vhdutil.VHD_FOOTER_SIZE ): raise xs_errors.XenError( @@ -252,7 +259,7 @@ def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): opterr='Failed to zero out VHD footer {}'.format(vdi_path) ) - LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, new_size, False) + LinstorVhdUtil(None, linstor).set_size_phys(vdi_path, result_size, False) journaler.remove(LinstorJournaler.INFLATE, vdi_uuid) @@ -1399,7 +1406,12 @@ def _handle_interrupted_inflate(self, vdi_uuid, old_size): util.SMlog('Cannot deflate missing VDI {}'.format(vdi_uuid)) return - current_size = self._all_volume_info_cache.get(self.uuid).virtual_size + assert not self._all_volume_info_cache + volume_info = self._linstor.get_volume_info(vdi_uuid) + + current_size = volume_info.virtual_size + assert current_size > 0 + util.zeroOut( vdi.path, current_size - vhdutil.VHD_FOOTER_SIZE, @@ -1695,11 +1707,11 @@ def create(self, sr_uuid, vdi_uuid, size): # 2. Compute size and check space available. 
size = vhdutil.validate_and_round_vhd_size(long(size)) - util.SMlog('LinstorVDI.create: type={}, size={}'.format( - self.vdi_type, size - )) - volume_size = compute_volume_size(size, self.vdi_type) + util.SMlog( + 'LinstorVDI.create: type={}, vhd-size={}, volume-size={}' + .format(self.vdi_type, size, volume_size) + ) self.sr._ensure_space_available(volume_size) # 3. Set sm_config attribute of VDI parent class. @@ -1917,9 +1929,23 @@ def detach(self, sr_uuid, vdi_uuid): def resize(self, sr_uuid, vdi_uuid, size): util.SMlog('LinstorVDI.resize for {}'.format(self.uuid)) + if not self.sr._is_master: + raise xs_errors.XenError( + 'VDISize', + opterr='resize on slave not allowed' + ) + if self.hidden: raise xs_errors.XenError('VDIUnavailable', opterr='hidden VDI') + # Compute the virtual VHD and DRBD volume size. + size = vhdutil.validate_and_round_vhd_size(long(size)) + volume_size = compute_volume_size(size, self.vdi_type) + util.SMlog( + 'LinstorVDI.resize: type={}, vhd-size={}, volume-size={}' + .format(self.vdi_type, size, volume_size) + ) + if size < self.size: util.SMlog( 'vdi_resize: shrinking not supported: ' @@ -1927,18 +1953,13 @@ def resize(self, sr_uuid, vdi_uuid, size): ) raise xs_errors.XenError('VDISize', opterr='shrinking not allowed') - # Compute the virtual VHD size. - size = vhdutil.validate_and_round_vhd_size(long(size)) - if size == self.size: return VDI.VDI.get_params(self) - # Compute the LINSTOR volume size. - new_volume_size = compute_volume_size(size, self.vdi_type) if self.vdi_type == vhdutil.VDI_TYPE_RAW: old_volume_size = self.size else: - old_volume_size = self.capacity + old_volume_size = self.utilisation if self.sr._provisioning == 'thin': # VDI is currently deflated, so keep it deflated. 
new_volume_size = old_volume_size From 386403fba98f0ba1584f568c0f24472515b86e9a Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 17 Jan 2023 11:55:00 +0100 Subject: [PATCH 098/133] fix(linstor-manager): add a static iptables rule for DRBD volumes Using the XAPI iptables firewall may drop DRBD packets when the connection tracking subsystem runs out of entries temporarily. Instead, use a static rule completely independent of the connection tracking module, it allow packets to pass even when the connection tracking table ran full temporarily. Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 48 ++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 5c4c5c90..6ee435c6 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -40,11 +40,12 @@ LVM_PLUGIN = 'lvm.py' THIN_POOL = 'thin_pool' FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' -LINSTOR_PORTS = [3366, 3370, 3376, 3377, '7000:8000', 8076, 8077] +LINSTOR_PORTS = [3366, 3370, 3376, 3377, 8076, 8077] +DRBD_PORTS = '7000:8000' -def update_port(port, open): - fn = 'open' if open else 'close' +def update_linstor_port(port, open_ports): + fn = 'open' if open_ports else 'close' args = ( FIREWALL_PORT_SCRIPT, fn, str(port), 'tcp' ) @@ -55,9 +56,36 @@ def update_port(port, open): raise Exception('Failed to {} port: {} {}'.format(fn, out, err)) -def update_all_ports(open): +def has_iptables_rule(rule): + (ret, stdout, stderr) = util.doexec(['iptables', '-C'] + rule) + return not ret + + +def update_drbd_ports(open_ports): + # We want to use a static rule regarding DRBD volumes, + # so we can't use the XAPI firewall port script, we have to manually + # check for existing rules before updating iptables service. 
+ rule = ['INPUT', '-p', 'tcp', '--dport', DRBD_PORTS, '-j', 'ACCEPT'] + if open_ports == has_iptables_rule(rule): + return + if open_ports: + rule.insert(1, '1') + (ret, stdout, stderr) = util.doexec(['iptables', '-I'] + rule) + if ret: + raise Exception('Failed to add DRBD rule: {}'.format(stderr)) + else: + (ret, stdout, stderr) = util.doexec(['iptables', '-D'] + rule) + if ret: + raise Exception('Failed to remove DRBD rule: {}'.format(stderr)) + (ret, stdout, stderr) = util.doexec(['service', 'iptables', 'save']) + if ret: + raise Exception('Failed to save DRBD rule: {}'.format(stderr)) + + +def update_all_ports(open_ports): for port in LINSTOR_PORTS: - update_port(port, open) + update_linstor_port(port, open_ports) + update_drbd_ports(open_ports) def update_linstor_satellite_service(start): @@ -202,7 +230,7 @@ def prepare_sr(session, args): try: LinstorSR.activate_lvm_group(args['groupName']) - update_all_ports(open=True) + update_all_ports(open_ports=True) # We don't want to enable and start minidrbdcluster daemon during # SR creation. update_minidrbdcluster_service(start=False) @@ -217,7 +245,7 @@ def release_sr(session, args): try: update_linstor_satellite_service(start=False) update_minidrbdcluster_service(start=False) - update_all_ports(open=False) + update_all_ports(open_ports=False) return str(True) except Exception as e: util.SMlog('linstor-manager:release_sr error: {}'.format(e)) @@ -533,7 +561,7 @@ def add_host(session, args): try: # 4. Enable services. 
- update_all_ports(open=True) + update_all_ports(open_ports=True) update_minidrbdcluster_service(start=True) update_linstor_satellite_service(start=True) @@ -664,7 +692,7 @@ def add_host(session, args): if stop_services and not linstor.has_node(node_name): update_linstor_satellite_service(start=False) update_minidrbdcluster_service(start=False) - update_all_ports(open=False) + update_all_ports(open_ports=False) except Exception: pass @@ -747,7 +775,7 @@ def remove_host(session, args): try: update_linstor_satellite_service(start=False) update_minidrbdcluster_service(start=False) - update_all_ports(open=False) + update_all_ports(open_ports=False) except Exception as e: util.SMlog('Error while stopping services: {}'.format(e)) pass From df9b4c57cbf4e02984d337ad148e5a8aba395709 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 3 Feb 2023 16:38:49 +0100 Subject: [PATCH 099/133] feat(LinstorSR): sync with last http-nbd-transfer version - Increase auto promote timeout of heartbeat VDI to reduce CPU usage - Modify server regexes - Force device size parameter of NBD servers Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 25 +++++++++++++++++++++---- drivers/linstorvolumemanager.py | 19 +++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 72ec9de7..31f45055 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1761,6 +1761,13 @@ def create(self, sr_uuid, vdi_uuid, size): METADATA_OF_POOL_TAG: '' } self._linstor.set_volume_metadata(self.uuid, volume_metadata) + + # Set the open timeout to 1min to reduce CPU usage + # in http-disk-server when a secondary server tries to open + # an already opened volume. 
+ if self.ty == 'ha_statefile' or self.ty == 'redo_log': + self._linstor.set_auto_promote_timeout(self.uuid, 600) + self._linstor.mark_volume_as_persistent(self.uuid) except util.CommandException as e: failed = True @@ -2595,9 +2602,11 @@ def _start_persistent_http_server(volume_name): with open(pid_path, 'w') as pid_file: pid_file.write(str(http_server.pid)) + reg_server_ready = re.compile("Server ready!$") def is_ready(): while http_server.poll() is None: - if http_server.stdout.readline().rstrip() == 'Server ready!': + line = http_server.stdout.readline() + if reg_server_ready.search(line): return True return False try: @@ -2630,10 +2639,16 @@ def _start_persistent_nbd_server(self, volume_name): nbd_server = None try: + # We use a precomputed device size. + # So if the XAPI is modified, we must update these values! if volume_name == 'xcp-persistent-ha-statefile': + # See: https://github.com/xapi-project/xen-api/blob/703479fa448a8d7141954bb6e8964d8e25c4ac2e/ocaml/xapi/xha_statefile.ml#L32-L37 port = '8076' + device_size = 4 * 1024 * 1024 else: + # See: https://github.com/xapi-project/xen-api/blob/703479fa448a8d7141954bb6e8964d8e25c4ac2e/ocaml/database/redo_log.ml#L41-L44 port = '8077' + device_size = 256 * 1024 * 1024 try: session = util.timeout_call(5, util.get_localAPI_session) @@ -2653,7 +2668,9 @@ def _start_persistent_nbd_server(self, volume_name): '--nbd-name', volume_name, '--urls', - ','.join(map(lambda ip: 'http://' + ip + ':' + port, ips)) + ','.join(map(lambda ip: 'http://' + ip + ':' + port, ips)), + '--device-size', + str(device_size) ] util.SMlog('Starting {} using port {}...'.format(arguments[0], port)) @@ -2670,11 +2687,11 @@ def _start_persistent_nbd_server(self, volume_name): with open(pid_path, 'w') as pid_file: pid_file.write(str(nbd_server.pid)) - reg_nbd_path = re.compile("^NBD `(/dev/nbd[0-9]+)` is now attached.$") + reg_nbd_path = re.compile("NBD `(/dev/nbd[0-9]+)` is now attached.$") def get_nbd_path(): while nbd_server.poll() is None: 
line = nbd_server.stdout.readline() - match = reg_nbd_path.match(line) + match = reg_nbd_path.search(line) if match: return match.group(1) # Use a timeout to never block the smapi if there is a problem. diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 81cce802..e0f39e71 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -851,6 +851,25 @@ def get_volume_size(self, volume_uuid): ) return size * 1024 + + def set_auto_promote_timeout(self, volume_uuid, timeout): + """ + Define the blocking time of open calls when a DRBD + is already open on another host. + :param str volume_uuid: The volume uuid to modify. + """ + + volume_name = self.get_volume_name(volume_uuid) + result = self._linstor.resource_dfn_modify(volume_name, { + 'DrbdOptions/Resource/auto-promote-timeout': timeout + }) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not change the auto promote timeout of `{}`: {}' + .format(volume_uuid, error_str) + ) + def get_volume_info(self, volume_uuid): """ Get the volume info of a particular volume. 
From 238deea99bc48e6aca0fbc90a79de72d28ae8b6f Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 13 Feb 2023 17:24:16 +0100 Subject: [PATCH 100/133] fix(LinstorSR): don't check VDI metadata while listing VDIs if it's deleted Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 31f45055..41ece825 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1169,6 +1169,9 @@ def _load_vdis_ex(self): if not introduce: continue + if vdi_uuid.startswith('DELETED_'): + continue + volume_metadata = volumes_metadata.get(vdi_uuid) if not volume_metadata: util.SMlog( @@ -1177,9 +1180,6 @@ def _load_vdis_ex(self): ) continue - if vdi_uuid.startswith('DELETED_'): - continue - util.SMlog( 'Trying to introduce VDI {} as it is present in ' 'LINSTOR and not in XAPI...' From a8feb6a29c5e76776614f594fa63bb7a5a367a0d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 13 Feb 2023 17:27:43 +0100 Subject: [PATCH 101/133] fix(LinstorSR): don't check metadata when destroying snap in undo_clone Remove useless check in the snap rollback helper when there is an error during the second `_create_snapshot` call (when VDI.clone command is executed). There is no reason to verify the metadata, this code is present in the LVM driver, but is useless here. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 41ece825..94cf1b77 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1496,10 +1496,6 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): # Remove the child nodes. 
if snap_uuid and snap_uuid in volume_names: util.SMlog('Destroying snap {}...'.format(snap_uuid)) - snap_metadata = self._linstor.get_volume_metadata(snap_uuid) - - if snap_metadata.get(VDI_TYPE_TAG) != vhdutil.VDI_TYPE_VHD: - raise util.SMException('Clone {} not VHD'.format(snap_uuid)) try: self._linstor.destroy_volume(snap_uuid) From bdb190c46163af07fb1f9ab6315e1ad8dbdc979a Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 15 Feb 2023 11:34:54 +0100 Subject: [PATCH 102/133] fix(linstorvhdutil): handle correctly generic exceptions in _raise_openers_exception Signed-off-by: Ronan Abhamon --- drivers/linstorvhdutil.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index 63d59ab5..05225e88 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -289,25 +289,26 @@ def _get_readonly_host(self, vdi_uuid, device_path, node_names): # -------------------------------------------------------------------------- def _raise_openers_exception(self, device_path, e): + if isinstance(e, util.CommandException): + e_str = 'cmd: `{}`, code: `{}`, reason: `{}`'.format(e.cmd, e.code, e.reason) + else: + e_str = str(e) + e_with_openers = None try: volume_uuid = self._linstor.get_volume_uuid_from_device_path( device_path ) - e_wrapper = util.CommandException( - e.code, - e.cmd, - e.reason + ' (openers: {})'.format( + e_wrapper = Exception( + e_str + ' (openers: {})'.format( self._linstor.get_volume_openers(volume_uuid) ) ) except Exception as illformed_e: - e_wrapper = util.CommandException( - e.code, - e.cmd, - e.reason + ' (unable to get openers: {})'.format(illformed_e) + e_wrapper = Exception( + e_str + ' (unable to get openers: {})'.format(illformed_e) ) - util.SMlog('raise opener exception: {} ({})'.format(e_wrapper, e_wrapper.reason)) + util.SMlog('raise opener exception: {}'.format(e_wrapper)) raise e_wrapper # pylint: disable = E0702 def 
_call_local_vhd_util(self, local_method, device_path, *args, **kwargs): @@ -330,7 +331,7 @@ def _call_local_vhd_util_or_fail(self, local_method, device_path, *args, **kwarg return self._call_local_vhd_util(local_method, device_path, *args, **kwargs) except ErofsLinstorCallException as e: # Volume is locked on a host, find openers. - self._raise_openers_exception(device_path, e) + self._raise_openers_exception(device_path, e.cmd_err) def _call_vhd_util(self, local_method, remote_method, device_path, use_parent, *args, **kwargs): # Note: `use_parent` exists to know if the VHD parent is used by the local/remote method. From f5b5cd809df4784cee44bba27dc2fc2b7d4383ce Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 16 Feb 2023 14:24:07 +0100 Subject: [PATCH 103/133] fix(minidrbdcluster): robustify to unmount correctly LINSTOR DB There is a small delay during which the database may not be unmounted because there are still processes using it. So must retry in this case. It's caused by the termination of the LINSTOR controller. 
Signed-off-by: Ronan Abhamon --- Makefile | 1 + etc/systemd/system/var-lib-linstor.service | 2 +- scripts/safe-umount | 39 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100755 scripts/safe-umount diff --git a/Makefile b/Makefile index c38d5407..e6ea5cee 100755 --- a/Makefile +++ b/Makefile @@ -240,6 +240,7 @@ install: precheck mkdir -p $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/fork-log-daemon $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/linstor-kv-tool $(SM_STAGING)$(BIN_DEST) + install -m 755 scripts/safe-umount $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/local-device-change $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/check-device-sharing $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/usb_change $(SM_STAGING)$(LIBEXEC) diff --git a/etc/systemd/system/var-lib-linstor.service b/etc/systemd/system/var-lib-linstor.service index d230d048..e9deb904 100644 --- a/etc/systemd/system/var-lib-linstor.service +++ b/etc/systemd/system/var-lib-linstor.service @@ -17,5 +17,5 @@ Description=Mount filesystem for the LINSTOR controller [Service] Type=oneshot ExecStart=/bin/mount -w /dev/drbd/by-res/xcp-persistent-database/0 /var/lib/linstor -ExecStop=/bin/umount /var/lib/linstor +ExecStop=/opt/xensource/libexec/safe-umount /var/lib/linstor RemainAfterExit=true diff --git a/scripts/safe-umount b/scripts/safe-umount new file mode 100755 index 00000000..9c1dcc40 --- /dev/null +++ b/scripts/safe-umount @@ -0,0 +1,39 @@ +#!/usr/bin/env python2 + +import argparse +import subprocess +import sys +import time + + +def safe_umount(path): + retry_count = 10 + not_mounted_str = 'umount: {}: not mounted'.format(path) + + last_code = 0 + while retry_count: + proc = subprocess.Popen(['mountpoint', '-q', path]) + proc.wait() + if proc.returncode: + return 0 + + proc = subprocess.Popen(['umount', path], stderr=subprocess.PIPE) + (stdout, stderr) = proc.communicate() + if not proc.returncode: + return 0 + + error = stderr.strip() + if error == 
not_mounted_str: + return 0 + + retry_count -= 1 + last_code = proc.returncode + time.sleep(0.500) + return last_code + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('path') + args = parser.parse_args() + sys.exit(safe_umount(args.path)) From ca09c4083accd00dbd41c90e754302347159ef00 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 20 Feb 2023 19:30:18 +0100 Subject: [PATCH 104/133] fix(minidrbdcluster): handle correctly KeyboardInterrupt with systemd units It's necessary to always add systemd services in the running list before trying to start a service, because if a KeyboardInterrupt is sent, we can have a running LINSTOR controller not present in the list, and then we can no longer unmount /var/lib/linstor because the controller is never stopped... Signed-off-by: Ronan Abhamon --- scripts/minidrbdcluster | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/minidrbdcluster b/scripts/minidrbdcluster index eae7cbfe..03d6b010 100755 --- a/scripts/minidrbdcluster +++ b/scripts/minidrbdcluster @@ -104,10 +104,11 @@ def process(events2, resources, running_services, status): res_name, may_promote, promotion_score = m.groups() if res_name in resources and may_promote == 'yes': for systemd_unit in resources[res_name]['systemd-units']: - if not ensure_systemd_started(systemd_unit): - break if systemd_unit not in running_services: running_services.append(systemd_unit) + if not ensure_systemd_started(systemd_unit): + running_services.pop() + break m = PEER_ROLE_RE.match(line) if m: res_name, conn_name, role = m.groups() From 66117da8b968daf4ce6ef76c7288599673bc4fc8 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 24 Feb 2023 14:28:29 +0100 Subject: [PATCH 105/133] feat(LinstorSR): use drbd-reactor instead of minidrbdcluster Signed-off-by: Ronan Abhamon --- Makefile | 7 -- drivers/LinstorSR.py | 37 +++--- drivers/linstor-manager | 67 +++++++++-- drivers/linstorvolumemanager.py | 4 +- 
etc/minidrbdcluster.ini | 14 --- scripts/minidrbdcluster | 203 -------------------------------- systemd/minidrbdcluster.service | 19 --- 7 files changed, 76 insertions(+), 275 deletions(-) delete mode 100644 etc/minidrbdcluster.ini delete mode 100755 scripts/minidrbdcluster delete mode 100644 systemd/minidrbdcluster.service diff --git a/Makefile b/Makefile index e6ea5cee..f8196cb1 100755 --- a/Makefile +++ b/Makefile @@ -99,7 +99,6 @@ MPATH_CUSTOM_CONF_DIR := /etc/multipath/conf.d/ MODPROBE_DIR := /etc/modprobe.d/ EXTENSION_SCRIPT_DEST := /etc/xapi.d/extensions/ LOGROTATE_DIR := /etc/logrotate.d/ -MINI_DRBD_CLUSTER_CONF_DIR := /etc/ SM_STAGING := $(DESTDIR) SM_STAMP := $(MY_OBJ_DIR)/.staging_stamp @@ -155,7 +154,6 @@ install: precheck mkdir -p $(SM_STAGING)$(MPATH_CUSTOM_CONF_DIR) mkdir -p $(SM_STAGING)$(MODPROBE_DIR) mkdir -p $(SM_STAGING)$(LOGROTATE_DIR) - mkdir -p $(SM_STAGING)$(MINI_DRBD_CLUSTER_CONF_DIR) mkdir -p $(SM_STAGING)$(DEBUG_DEST) mkdir -p $(SM_STAGING)$(BIN_DEST) mkdir -p $(SM_STAGING)$(MASTER_SCRIPT_DEST) @@ -183,8 +181,6 @@ install: precheck $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d/ install -m 644 etc/systemd/system/var-lib-linstor.service \ $(SM_STAGING)/$(SYSTEMD_CONF_DIR) - install -m 644 etc/minidrbdcluster.ini \ - $(SM_STAGING)/$(MINI_DRBD_CLUSTER_CONF_DIR) install -m 644 etc/make-dummy-sr.service \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/xs-sm.service \ @@ -203,8 +199,6 @@ install: precheck $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) install -m 644 systemd/linstor-monitor.service \ $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) - install -m 644 systemd/minidrbdcluster.service \ - $(SM_STAGING)/$(SYSTEMD_SERVICE_DIR) for i in $(UDEV_RULES); do \ install -m 644 udev/$$i.rules \ $(SM_STAGING)$(UDEV_RULES_DIR); done @@ -257,7 +251,6 @@ install: precheck install -m 755 scripts/xe-getlunidentifier $(SM_STAGING)$(BIN_DEST) install -m 755 scripts/make-dummy-sr $(SM_STAGING)$(LIBEXEC) install -m 755 scripts/storage-init 
$(SM_STAGING)$(LIBEXEC) - install -m 755 scripts/minidrbdcluster $(SM_STAGING)$(LIBEXEC) .PHONY: clean clean: diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 94cf1b77..a3da28e7 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -461,6 +461,10 @@ def wrapped_method(self, *args, **kwargs): return method(self, *args, **kwargs) def load(self, *args, **kwargs): + # Activate all LVMs to make drbd-reactor happy. + if self.srcmd.cmd == 'sr_attach': + activate_lvm_group(self._group_name) + if not self._has_session: if self.srcmd.cmd in ( 'vdi_attach_from_config', @@ -707,7 +711,7 @@ def create(self, uuid, size): ) # Ensure ports are opened and LINSTOR satellites - # are activated. In the same time the minidrbdcluster instances + # are activated. In the same time the drbd-reactor instances # must be stopped. self._prepare_sr_on_all_hosts(self._group_name, enabled=True) @@ -730,9 +734,9 @@ def create(self, uuid, size): try: util.SMlog( - "Finishing SR creation, enable minidrbdcluster on all hosts..." + "Finishing SR creation, enable drbd-reactor on all hosts..." 
) - self._update_minidrbdcluster_on_all_hosts(enabled=True) + self._update_drbd_reactor_on_all_hosts(enabled=True) except Exception as e: try: self._linstor.destroy() @@ -777,7 +781,7 @@ def delete(self, uuid): ) try: - self._update_minidrbdcluster_on_all_hosts( + self._update_drbd_reactor_on_all_hosts( controller_node_name=node_name, enabled=False ) @@ -789,12 +793,12 @@ def delete(self, uuid): ) except Exception as e: try: - self._update_minidrbdcluster_on_all_hosts( + self._update_drbd_reactor_on_all_hosts( controller_node_name=node_name, enabled=True ) except Exception as e2: util.SMlog( - 'Failed to restart minidrbdcluster after destroy fail: {}' + 'Failed to restart drbd-reactor after destroy fail: {}' .format(e2) ) util.SMlog('Failed to delete LINSTOR SR: {}'.format(e)) @@ -838,7 +842,6 @@ def attach(self, uuid): 'SRUnavailable', opterr='no such group: {}'.format(self._group_name) ) - activate_lvm_group(self._group_name) @_locked_load def detach(self, uuid): @@ -963,15 +966,15 @@ def _prepare_sr_on_all_hosts(self, group_name, enabled): for slave in util.get_all_slaves(self.session): self._prepare_sr(slave, group_name, enabled) - def _update_minidrbdcluster(self, host, enabled): + def _update_drbd_reactor(self, host, enabled): self._exec_manager_command( host, - 'updateMinidrbdcluster', + 'updateDrbdReactor', {'enabled': str(enabled)}, 'SRUnavailable' ) - def _update_minidrbdcluster_on_all_hosts( + def _update_drbd_reactor_on_all_hosts( self, enabled, controller_node_name=None ): if controller_node_name == 'localhost': @@ -999,27 +1002,27 @@ def _update_minidrbdcluster_on_all_hosts( )) if enabled and controller_host: - util.SMlog('{} minidrbdcluster on controller host `{}`...'.format( + util.SMlog('{} drbd-reactor on controller host `{}`...'.format( action_name, controller_node_name )) # If enabled is true, we try to start the controller on the desired # node name first. 
- self._update_minidrbdcluster(controller_host, enabled) + self._update_drbd_reactor(controller_host, enabled) for host_ref, hostname in secondary_hosts: - util.SMlog('{} minidrbdcluster on host {}...'.format( + util.SMlog('{} drbd-reactor on host {}...'.format( action_name, hostname )) - self._update_minidrbdcluster(host_ref, enabled) + self._update_drbd_reactor(host_ref, enabled) if not enabled and controller_host: - util.SMlog('{} minidrbdcluster on controller host `{}`...'.format( + util.SMlog('{} drbd-reactor on controller host `{}`...'.format( action_name, controller_node_name )) - # If enabled is false, we disable the minidrbdcluster service of + # If enabled is false, we disable the drbd-reactor service of # the controller host last. Why? Otherwise the linstor-controller # of other nodes can be started, and we don't want that. - self._update_minidrbdcluster(controller_host, enabled) + self._update_drbd_reactor(controller_host, enabled) # -------------------------------------------------------------------------- # Metadata. 
diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 6ee435c6..7e34b5f6 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -22,6 +22,7 @@ sys.path[0] = '/opt/xensource/sm/' import base64 import distutils.util +import os import socket import XenAPI import XenAPIPlugin @@ -43,6 +44,19 @@ FIREWALL_PORT_SCRIPT = '/etc/xapi.d/plugins/firewall-port' LINSTOR_PORTS = [3366, 3370, 3376, 3377, 8076, 8077] DRBD_PORTS = '7000:8000' +DRBD_REACTOR_CONF = '/etc/drbd-reactor.d/sm-linstor.toml' + +DRBD_REACTOR_CONF_CONTENT = """[[promoter]] + +[promoter.resources.xcp-persistent-database] +start = [ "var-lib-linstor.service", "linstor-controller.service" ] +""" + +DRBD_REACTOR_DEPS = [ + '/run/systemd/system/linstor-controller.service.d/reactor.conf', + '/run/systemd/system/var-lib-linstor.service.d/reactor.conf' +] + def update_linstor_port(port, open_ports): fn = 'open' if open_ports else 'close' @@ -101,8 +115,35 @@ def update_linstor_satellite_service(start): util.enable_and_start_service(service, True) -def update_minidrbdcluster_service(start): - util.enable_and_start_service('minidrbdcluster', start) +def update_drbd_reactor_service(start): + if start: + util.atomicFileWrite(DRBD_REACTOR_CONF, None, DRBD_REACTOR_CONF_CONTENT) + else: + try: + os.remove(DRBD_REACTOR_CONF) + except Exception: + pass + + util.stop_service('drbd-reactor') + + try: + util.stop_service('drbd-promote@xcp\x2dpersistent\x2ddatabase.service') + except Exception as e: + if str(e).rstrip().endswith(' not loaded.'): + pass + raise e + + util.stop_service('linstor-controller') + util.stop_service('var-lib-linstor.service') + + for dep in DRBD_REACTOR_DEPS: + try: + os.remove(dep) + except Exception: + pass + + util.doexec(['systemctl', 'daemon-reload']) + util.enable_and_start_service('drbd-reactor', start) def exec_create_sr(session, name, description, disks, volume_group, redundancy, provisioning, force): @@ -231,9 +272,9 @@ def prepare_sr(session, args): 
LinstorSR.activate_lvm_group(args['groupName']) update_all_ports(open_ports=True) - # We don't want to enable and start minidrbdcluster daemon during + # We don't want to enable and start drbd-reactor daemon during # SR creation. - update_minidrbdcluster_service(start=False) + update_drbd_reactor_service(start=False) update_linstor_satellite_service(start=True) return str(True) except Exception as e: @@ -244,7 +285,7 @@ def prepare_sr(session, args): def release_sr(session, args): try: update_linstor_satellite_service(start=False) - update_minidrbdcluster_service(start=False) + update_drbd_reactor_service(start=False) update_all_ports(open_ports=False) return str(True) except Exception as e: @@ -252,14 +293,14 @@ def release_sr(session, args): return str(False) -def update_minidrbdcluster(session, args): +def update_drbd_reactor(session, args): try: enabled = distutils.util.strtobool(args['enabled']) - update_minidrbdcluster_service(start=enabled) + update_drbd_reactor_service(start=enabled) return str(True) except Exception as e: util.SMlog( - 'linstor-manager:update_minidrbdcluster error: {}'.format(e) + 'linstor-manager:update_drbd_reactor error: {}'.format(e) ) return str(False) @@ -308,7 +349,7 @@ def destroy(session, args): try: group_name = args['groupName'] - # When destroy is called, there are no running minidrbdcluster daemons. + # When destroy is called, there are no running drbd-reactor daemons. # So the controllers are stopped too, we must start an instance. util.restart_service('var-lib-linstor.service') util.restart_service('linstor-controller') @@ -562,7 +603,7 @@ def add_host(session, args): try: # 4. Enable services. update_all_ports(open_ports=True) - update_minidrbdcluster_service(start=True) + update_drbd_reactor_service(start=True) update_linstor_satellite_service(start=True) # 5. Try to create local node. @@ -691,7 +732,7 @@ def add_host(session, args): # If we failed to remove the node, we don't stop services. 
if stop_services and not linstor.has_node(node_name): update_linstor_satellite_service(start=False) - update_minidrbdcluster_service(start=False) + update_drbd_reactor_service(start=False) update_all_ports(open_ports=False) except Exception: pass @@ -774,7 +815,7 @@ def remove_host(session, args): # 3. Stop services. try: update_linstor_satellite_service(start=False) - update_minidrbdcluster_service(start=False) + update_drbd_reactor_service(start=False) update_all_ports(open_ports=False) except Exception as e: util.SMlog('Error while stopping services: {}'.format(e)) @@ -1005,7 +1046,7 @@ if __name__ == '__main__': XenAPIPlugin.dispatch({ 'prepareSr': prepare_sr, 'releaseSr': release_sr, - 'updateMinidrbdcluster': update_minidrbdcluster, + 'updateDrbdReactor': update_drbd_reactor, 'attach': attach, 'detach': detach, 'destroy': destroy, diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index e0f39e71..4662043c 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1622,7 +1622,7 @@ def create_sr( ) finally: # Controller must be stopped and volume unmounted because - # it is the role of the minidrbdcluster daemon to do the right + # it is the role of the drbd-reactor daemon to do the right # actions. cls._start_controller(start=False) cls._mount_volume( @@ -2625,7 +2625,7 @@ def _create_database_volume( ) # We must modify the quorum. Otherwise we can't use correctly the - # minidrbdcluster daemon. + # drbd-reactor daemon. if auto_quorum: result = lin.resource_dfn_modify(DATABASE_VOLUME_NAME, { 'DrbdOptions/auto-quorum': 'disabled', diff --git a/etc/minidrbdcluster.ini b/etc/minidrbdcluster.ini deleted file mode 100644 index 9e523427..00000000 --- a/etc/minidrbdcluster.ini +++ /dev/null @@ -1,14 +0,0 @@ -# minidrbdcluster keeps a service running on one of the nodes. -# Quorum must be enabled in the DRBD resource! -# -# The section names are the names of DRBD resources. 
Within a -# section name the systemd-units to activate on one of the nodes. - -[xcp-persistent-database] -systemd-units=var-lib-linstor.service,linstor-controller.service - -[xcp-persistent-ha-statefile] -systemd-units= - -[xcp-persistent-redo-log] -systemd-units= diff --git a/scripts/minidrbdcluster b/scripts/minidrbdcluster deleted file mode 100755 index 03d6b010..00000000 --- a/scripts/minidrbdcluster +++ /dev/null @@ -1,203 +0,0 @@ -#! /usr/bin/env python2 - -import configparser -import re -import signal -import subprocess - -DRBDADM_OPEN_FAILED_RE = re.compile( - 'open\\((.*)\\) failed: No such file or directory' -) -MAY_PROMOT_RE = re.compile( - '(?:exists|change) resource name:((?:\\w|-)+) ' - '(?:(?:\\w|-)+\\:(?:\\w|-)+ )*may_promote:(yes|no) promotion_score:(\\d+)' -) -PEER_ROLE_RE = re.compile( - '(?:exists|change) connection name:((?:\\w|-)+) peer-node-id:(?:\\d+) ' - 'conn-name:((?:\\w|-)+) (?:(?:\\w|-)+\\:(?:\\w|-)+ )*role:(Primary|Secondary|Unknown)' -) -HAVE_QUORUM_RE = re.compile( - '(?:exists|change) device name:((?:\\w|-)+) ' - '(?:(?:\\w|-)+\\:(?:\\w|-)+ )*quorum:(yes|no)' -) - - -class SigHupException(Exception): - pass - - -def sig_handler(sig, frame): - raise SigHupException( - 'Received signal ' + str(sig) + - ' on line ' + str(frame.f_lineno) + - ' in ' + frame.f_code.co_filename - ) - - -def preexec_subprocess(): - signal.signal(signal.SIGINT, signal.SIG_IGN) - - -def exec_subprocess(args): - proc = subprocess.Popen(args, preexec_fn=preexec_subprocess) - raise_sigint = False - while True: - try: - proc.wait() - break - except KeyboardInterrupt: - raise_sigint = True - except: # noqa: E722 - pass - - if raise_sigint: - raise KeyboardInterrupt - - return proc.returncode - - -def call_systemd(operation, service): - verbose = operation in ('start', 'stop') - if verbose: - print('Trying to %s %s' % (operation, service)) - ret = exec_subprocess(['systemctl', operation, service]) - if verbose: - print('%s for %s %s' % ( - 'success' if ret == 0 
else 'failure', operation, service - )) - return ret == 0 - - -def ensure_systemd_started(service): - if not exec_subprocess(['systemctl', 'is-active', '--quiet', service]): - return True # Already active. - - return call_systemd('start', service) - - -def show_status(services, status): - print('status:') - for systemd_unit in services: - call_systemd('status', systemd_unit) - for res_name in status: - print('%s is %s' % (res_name, status[res_name])) - - -def stop_services(services): - for systemd_unit in reversed(services): - call_systemd('stop', systemd_unit) - - -def get_systemd_units(systemd_units_str): - systemd_units = [] - for systemd_unit in systemd_units_str.split(','): - systemd_unit = systemd_unit.strip() - if systemd_unit: - systemd_units.append(systemd_unit) - return systemd_units - - -def process(events2, resources, running_services, status): - line = events2.stdout.readline() - m = MAY_PROMOT_RE.match(line) - if m: - res_name, may_promote, promotion_score = m.groups() - if res_name in resources and may_promote == 'yes': - for systemd_unit in resources[res_name]['systemd-units']: - if systemd_unit not in running_services: - running_services.append(systemd_unit) - if not ensure_systemd_started(systemd_unit): - running_services.pop() - break - m = PEER_ROLE_RE.match(line) - if m: - res_name, conn_name, role = m.groups() - if res_name in status: - status[res_name][conn_name] = role - m = HAVE_QUORUM_RE.match(line) - if m: - res_name, have_quorum = m.groups() - if res_name in resources and have_quorum == 'no': - systemd_units = resources[res_name]['systemd-units'] - to_stop = [x for x in systemd_units if x in running_services] - if to_stop: - print('Lost quorum on %s' % (res_name)) - for systemd_unit in reversed(to_stop): - r = call_systemd('stop', systemd_unit) - if r: - running_services.remove(systemd_unit) - - -def active_drbd_volume(res_name): - retry = True - args = ['drbdadm', 'adjust', res_name] - while True: - proc = subprocess.Popen(args, 
stderr=subprocess.PIPE) - (stdout, stderr) = proc.communicate() - if not proc.returncode: - return # Success. \o/ - - if not retry: - break - - m = DRBDADM_OPEN_FAILED_RE.match(stderr) - if m and subprocess.call(['lvchange', '-ay', m.groups()[0]]) == 0: - retry = False - else: - break - - print('Failed to execute `{}`: {}'.format(args, stderr)) - - -def main(): - # 1. Load minidrbdcluster config. - config = configparser.ConfigParser() - config.read('/etc/minidrbdcluster.ini') - resources = config._sections - if not resources: - raise Exception( - 'No resources to watch, maybe /etc/minidrbdcluster.ini missing' - ) - print('Managing DRBD resources: %s' % (' '.join(resources))) - - # 2. Prepare resources. - status = dict() - all_services = [] # Contains common services between each DRBD volumes. - for res_name, resource in resources.iteritems(): - status[res_name] = dict() - active_drbd_volume(res_name) - systemd_units = get_systemd_units(resource['systemd-units']) - resource['systemd-units'] = systemd_units - - for systemd_unit in systemd_units: - if systemd_unit not in all_services: - all_services.append(systemd_unit) - - # 3. Ensure all services are stopped. - stop_services(all_services) - - # 4. Run! 
- signal.signal(signal.SIGHUP, sig_handler) - - running_services = [] - - print('Starting process...') - events2 = subprocess.Popen( - ['drbdsetup', 'events2'], stdout=subprocess.PIPE - ) - run = True - while run: - try: - process(events2, resources, running_services, status) - except KeyboardInterrupt: - run = False - except SigHupException: - show_status(running_services, status) - except Exception: - print('Unhandled exception: %s' % str(e)) - - print('Exiting...') - stop_services(running_services) - -if __name__ == '__main__': - main() diff --git a/systemd/minidrbdcluster.service b/systemd/minidrbdcluster.service deleted file mode 100644 index 1ddf91f3..00000000 --- a/systemd/minidrbdcluster.service +++ /dev/null @@ -1,19 +0,0 @@ -[Unit] -Description=Minimalistic high-availability cluster resource manager -Before=xs-sm.service -Wants=network-online.target -After=network-online.target - -[Service] -Type=simple -Environment=PYTHONUNBUFFERED=1 -ExecStart=/opt/xensource/libexec/minidrbdcluster -KillMode=process -KillSignal=SIGINT -SendSIGKILL=no -StandardOutput=journal -StandardError=journal -SyslogIdentifier=minidrbdcluster - -[Install] -WantedBy=multi-user.target From fe69c81c7d2270aca4e1ebb00756aa2fda59124d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 1 Mar 2023 10:56:43 +0100 Subject: [PATCH 106/133] fix(LinstorSR): ensure vhdutil calls are correctly executed on pools with > 3 hosts Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 20 +++++++------------- drivers/linstorvhdutil.py | 19 ++++++++++++++++--- drivers/linstorvolumemanager.py | 8 ++++---- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index a3da28e7..10e0f543 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1884,9 +1884,12 @@ def attach(self, sr_uuid, vdi_uuid): return self._attach_using_http_nbd() if not util.pathexists(self.path): - raise xs_errors.XenError( - 'VDIUnavailable', opterr='Could not find: 
{}'.format(self.path) - ) + # Ensure we have a path... + self._linstor.get_device_path(vdi_uuid) + if not util.pathexists(self.path): + raise xs_errors.XenError( + 'VDIUnavailable', opterr='Could not find: {}'.format(self.path) + ) self.attached = True return VDI.VDI.attach(self, self.sr.uuid, self.uuid) @@ -2137,16 +2140,7 @@ def _load_this(self): self.size = volume_info.virtual_size self.parent = '' else: - try: - vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) - except util.CommandException as e: - if e.code != errno.ENOENT: - raise - # Path doesn't exist. Probably a diskless without local path. - # Force creation and retry. - self._linstor.get_device_path(self.uuid) - vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) - + vhd_info = self.sr._vhdutil.get_vhd_info(self.uuid) self.hidden = vhd_info.hidden self.size = vhd_info.sizeVirt self.parent = vhd_info.parentUuid diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index 05225e88..c1b817d7 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -47,7 +47,7 @@ def call_vhd_util_on_host(session, host_ref, method, device_path, args): return response -class ErofsLinstorCallException(Exception): +class LinstorCallException(Exception): def __init__(self, cmd_err): self.cmd_err = cmd_err @@ -55,6 +55,14 @@ def __str__(self): return str(self.cmd_err) +class ErofsLinstorCallException(LinstorCallException): + pass + + +class NoPathLinstorCallException(LinstorCallException): + pass + + def linstorhostcall(local_method, remote_method): def decorated(response_parser): def wrapper(*args, **kwargs): @@ -70,12 +78,12 @@ def wrapper(*args, **kwargs): # Try to read locally if the device is not in use or if the device # is up to date and not diskless. 
- (node_names, in_use) = \ + (node_names, in_use_by) = \ self._linstor.find_up_to_date_diskful_nodes(vdi_uuid) local_e = None try: - if not in_use or socket.gethostname() in node_names: + if not in_use_by or socket.gethostname() in node_names: return self._call_local_vhd_util(local_method, device_path, *args[2:], **kwargs) except ErofsLinstorCallException as e: local_e = e.cmd_err @@ -88,6 +96,9 @@ def wrapper(*args, **kwargs): ) ) + if in_use_by: + node_names = {in_use_by} + # B. Execute the plugin on master or slave. remote_args = { 'devicePath': device_path, @@ -319,6 +330,8 @@ def local_call(): except util.CommandException as e: if e.code == errno.EROFS or e.code == EMEDIUMTYPE: raise ErofsLinstorCallException(e) # Break retry calls. + if e.code == errno.ENOENT: + raise NoPathLinstorCallException(e) raise e # Retry only locally if it's not an EROFS exception. return util.retry(local_call, 5, 2, exceptions=[util.CommandException]) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 4662043c..5ab83c41 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1410,12 +1410,12 @@ def find_up_to_date_diskful_nodes(self, volume_uuid): The disk must be up to data to be used. :param str volume_uuid: The volume to use. :return: The available nodes. 
- :rtype: tuple(set(str), bool) + :rtype: tuple(set(str), str) """ volume_name = self.get_volume_name(volume_uuid) - in_use = False + in_use_by = None node_names = set() resource_states = filter( @@ -1428,9 +1428,9 @@ def find_up_to_date_diskful_nodes(self, volume_uuid): if volume_state.disk_state == 'UpToDate': node_names.add(resource_state.node_name) if resource_state.in_use: - in_use = True + in_use_by = resource_state.node_name - return (node_names, in_use) + return (node_names, in_use_by) def invalidate_resource_cache(self): """ From 5930118c466c092ca37601426dbe55f88878e7e4 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 9 Mar 2023 17:06:59 +0100 Subject: [PATCH 107/133] fix(LinstorSR): replace bad param in detach_thin impl To get the physical size, the volume UUID must be used, not the path. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 10e0f543..c42f07d6 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -215,7 +215,7 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): new_volume_size = LinstorVolumeManager.round_up_volume_size( # TODO: Replace pylint comment with this feature when possible: # https://github.com/PyCQA/pylint/pull/2926 - LinstorVhdUtil(session, linstor).get_size_phys(device_path) # pylint: disable = E1120 + LinstorVhdUtil(session, linstor).get_size_phys(vdi_uuid) # pylint: disable = E1120 ) volume_info = linstor.get_volume_info(vdi_uuid) From dc046eac96368086bfcb73952eac89b61f24e255 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 10 Mar 2023 18:11:10 +0100 Subject: [PATCH 108/133] fix(linstorvolumemanager): remove usage of realpath Because a diskless DRBD path does not always exist, get_volume_name_from_device_path can fail.
It's easy to reproduce using > 4 hosts and with a call to linstorvhdutil.get_vhd_info: This problem can occur if the parent of a VHD is not on the same machine and if this parent doesn't have a DRBD path locally. Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 5ab83c41..8befb33f 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -914,28 +914,17 @@ def get_volume_uuid_from_device_path(self, device_path): def get_volume_name_from_device_path(self, device_path): """ - Get the volume name of a device_path on the current host. + Get the volume name of a device_path. :param str device_path: The dev path to find the volume name. - :return: The volume name of the local device path. + :return: The volume name of the device path. :rtype: str """ - node_name = socket.gethostname() - - resources = filter( - lambda resource: resource.node_name == node_name, - self._get_resource_cache().resources - ) - - real_device_path = os.path.realpath(device_path) - for resource in resources: - if resource.volumes[0].device_path == real_device_path: - return resource.name + assert device_path.startswith(DRBD_BY_RES_PATH) - raise LinstorVolumeManagerError( - 'Unable to find volume name from dev path `{}`' - .format(device_path) - ) + res_name_end = device_path.find('/', len(DRBD_BY_RES_PATH)) + assert res_name_end != -1 + return device_path[len(DRBD_BY_RES_PATH):res_name_end] def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): """ From 0b046a67bf9529c68963ae5e839a806ea5bef5e0 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 16 Mar 2023 18:54:46 +0100 Subject: [PATCH 109/133] fix(linstorvhdutil): avoid parent path resolution When many hosts are used (>= 4), we can fail to get VHD info (with parent option) because the local parent VDI path can be absent 
(no DRBD diskless path). So it's necessary to deactivate parent resolution: - vhdutil has been patched to support that - vhdutil returns a relative path now when "-u" option is used Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 2 +- drivers/linstorvhdutil.py | 9 ++++++--- drivers/linstorvolumemanager.py | 13 ++++++++++--- drivers/vhdutil.py | 5 ++++- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 7e34b5f6..9e96aaca 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -400,7 +400,7 @@ def get_vhd_info(session, args): ) vhd_info = vhdutil.getVHDInfo( - device_path, extract_uuid, include_parent + device_path, extract_uuid, include_parent, False ) return json.dumps(vhd_info.__dict__) except Exception as e: diff --git a/drivers/linstorvhdutil.py b/drivers/linstorvhdutil.py index c1b817d7..8b6985d9 100644 --- a/drivers/linstorvhdutil.py +++ b/drivers/linstorvhdutil.py @@ -143,8 +143,8 @@ def __init__(self, session, linstor): def check(self, vdi_uuid, ignore_missing_footer=False, fast=False): kwargs = { - 'ignoreMissingFooter': str(ignore_missing_footer), - 'fast': str(fast) + 'ignoreMissingFooter': ignore_missing_footer, + 'fast': fast } return self._check(vdi_uuid, **kwargs) # pylint: disable = E1123 @@ -153,7 +153,10 @@ def _check(self, vdi_uuid, response): return distutils.util.strtobool(response) def get_vhd_info(self, vdi_uuid, include_parent=True): - kwargs = {'includeParent': str(include_parent)} + kwargs = { + 'includeParent': include_parent, + 'resolveParent': False + } # TODO: Replace pylint comment with this feature when possible: # https://github.com/PyCQA/pylint/pull/2926 return self._get_vhd_info(vdi_uuid, self._extract_uuid, **kwargs) # pylint: disable = E1123 diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 8befb33f..91db3d80 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -920,11 
+920,18 @@ def get_volume_name_from_device_path(self, device_path): :rtype: str """ - assert device_path.startswith(DRBD_BY_RES_PATH) + # Assume that we have a path like this: + # - "/dev/drbd/by-res/xcp-volume-/0" + # - "../xcp-volume-/0" + if device_path.startswith(DRBD_BY_RES_PATH): + prefix_len = len(DRBD_BY_RES_PATH) + else: + assert device_path.startswith('../') + prefix_len = 3 - res_name_end = device_path.find('/', len(DRBD_BY_RES_PATH)) + res_name_end = device_path.find('/', prefix_len) assert res_name_end != -1 - return device_path[len(DRBD_BY_RES_PATH):res_name_end] + return device_path[prefix_len:res_name_end] def update_volume_uuid(self, volume_uuid, new_volume_uuid, force=False): """ diff --git a/drivers/vhdutil.py b/drivers/vhdutil.py index d75edb11..48337f87 100755 --- a/drivers/vhdutil.py +++ b/drivers/vhdutil.py @@ -100,13 +100,16 @@ def fullSizeVHD(virtual_size): def ioretry(cmd, errlist=[errno.EIO, errno.EAGAIN]): return util.ioretry(lambda: util.pread2(cmd), errlist) -def getVHDInfo(path, extractUuidFunction, includeParent = True): +def getVHDInfo(path, extractUuidFunction, includeParent=True, resolveParent=True): """Get the VHD info. The parent info may optionally be omitted: vhd-util tries to verify the parent by opening it, which results in error if the VHD resides on an inactive LV""" opts = "-vsf" if includeParent: opts += "p" + if not resolveParent: + opts += "u" + cmd = [VHD_UTIL, "query", OPT_LOG_ERR, opts, "-n", path] ret = ioretry(cmd) fields = ret.strip().split('\n') From eefaba4cd6080f5232f7485c7b774f61a6143ca9 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 17 Mar 2023 12:06:08 +0100 Subject: [PATCH 110/133] fix(LinstorSR): create parent path during attach It's necessary to force DRBD diskless path creation when a VDI is attached. Otherwise the attach can fail on pool with >= 4 hosts. 
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index c42f07d6..48feec7a 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1883,13 +1883,14 @@ def attach(self, sr_uuid, vdi_uuid): ): return self._attach_using_http_nbd() - if not util.pathexists(self.path): - # Ensure we have a path... - self._linstor.get_device_path(vdi_uuid) - if not util.pathexists(self.path): + # Ensure we have a path... + while vdi_uuid: + path = self._linstor.get_device_path(vdi_uuid) + if not util.pathexists(path): raise xs_errors.XenError( - 'VDIUnavailable', opterr='Could not find: {}'.format(self.path) + 'VDIUnavailable', opterr='Could not find: {}'.format(path) ) + vdi_uuid = self.sr._vhdutil.get_vhd_info(vdi_uuid).parentUuid self.attached = True return VDI.VDI.attach(self, self.sr.uuid, self.uuid) From 946ac8a29381a1d46ea78f0f4d1ec28ba5f64c2d Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 3 Apr 2023 10:03:57 +0200 Subject: [PATCH 111/133] fix(LinstorSR): retry if we can't build volume cache Otherwise after SR creation, the master PBD can be unplugged. 
See: https://xcp-ng.org/forum/post/60726 Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 48feec7a..324033a0 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1553,7 +1553,11 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): def _create_linstor_cache(self): self._all_volume_metadata_cache = \ self._linstor.get_volumes_with_metadata() - self._all_volume_info_cache = self._linstor.get_volumes_with_info() + self._all_volume_info_cache = util.retry( + self._linstor.get_volumes_with_info, + maxretry=10, + period=1 + ) def _destroy_linstor_cache(self): self._all_volume_info_cache = None From f06b45f7999d0fb8d4c35f8374c4574d19a10e8e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 25 Apr 2023 10:46:00 +0200 Subject: [PATCH 112/133] fix(linstorvolumemanager): reduce peer-slots param to 3 Because we use 3 backing disks at most, it's useless to increase the default linstor limit (8). Diskless is a resource that does not count in the peer-slots param. 
Note: this change is important to reduce the RAM usage, see => https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-meta-data-size Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 91db3d80..6f20c02c 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -2089,7 +2089,7 @@ def create_definition(): volume_uuid, self._group_name ) - self._increase_volume_peer_slots(self._linstor, volume_name) + self._configure_volume_peer_slots(self._linstor, volume_name) def clean(): try: @@ -2475,12 +2475,12 @@ def connect(uri): ) @classmethod - def _increase_volume_peer_slots(cls, lin, volume_name): - result = lin.resource_dfn_modify(volume_name, {}, peer_slots=31) + def _configure_volume_peer_slots(cls, lin, volume_name): + result = lin.resource_dfn_modify(volume_name, {}, peer_slots=3) error_str = cls._get_error_str(result) if error_str: raise LinstorVolumeManagerError( - 'Could not increase volume peer slots of {}: {}' + 'Could not configure volume peer slots of {}: {}' .format(volume_name, error_str) ) @@ -2581,7 +2581,7 @@ def _create_database_volume( vlm_sizes=['{}B'.format(size)], definitions_only=True ), DATABASE_VOLUME_NAME, group_name) - cls._increase_volume_peer_slots(lin, DATABASE_VOLUME_NAME) + cls._configure_volume_peer_slots(lin, DATABASE_VOLUME_NAME) # Create real resources on the first nodes. 
resources = [] From 6791d8b66b66c9811b0de3b6ce71df2d4808d699 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 25 Apr 2023 11:20:55 +0200 Subject: [PATCH 113/133] fix(LinstorSR): attach a valid XAPI session when is_open is called Signed-off-by: Ronan Abhamon --- drivers/on_slave.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/on_slave.py b/drivers/on_slave.py index 0633cffa..7b8c55d9 100755 --- a/drivers/on_slave.py +++ b/drivers/on_slave.py @@ -124,6 +124,12 @@ def _is_open(session, args): driver = SR.driver(srType) sr = driver(cmd, sr_uuid) + + # session_ref param is required to have a valid session when SR object is created. + # It's not the case here, so attach the current session object to make LinstorSR happy. + if srType == 'linstor': + sr.session = session + vdi = sr.vdi(vdiUuid) tapdisk = blktap2.Tapdisk.find_by_path(vdi.path) util.SMlog("Tapdisk for %s: %s" % (vdi.path, tapdisk)) From a4d1c07683c4847dd2b1770458b30933a0cde3c2 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 28 Apr 2023 10:43:27 +0200 Subject: [PATCH 114/133] fix(LinstorSR): ensure we always have a DRBD path to snap Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 324033a0..8c0b007d 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -2392,6 +2392,9 @@ def _snapshot(self, snap_type, cbtlog=None, cbt_consistency=None): elif depth >= vhdutil.MAX_CHAIN_SIZE: raise xs_errors.XenError('SnapshotChainTooLong') + # Ensure we have a valid path if we don't have a local diskful.
+ self.sr._linstor.get_device_path(self.uuid) + volume_path = self.path if not util.pathexists(volume_path): raise xs_errors.XenError( From cf80547b27489833dd87093743bd82bfe435a01c Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 30 May 2023 11:19:13 +0200 Subject: [PATCH 115/133] fix(LinstorSR): remove hosts/ips param --- drivers/LinstorSR.py | 47 +++++---------------------------- drivers/linstorvolumemanager.py | 11 +++----- 2 files changed, 11 insertions(+), 47 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 8c0b007d..7a9cbac6 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -100,8 +100,6 @@ CONFIGURATION = [ ['group-name', 'LVM group name'], - ['hosts', 'host names to use'], - ['ips', 'ips to use (optional, defaults to management networks)'], ['redundancy', 'replication count'], ['provisioning', '"thin" or "thick" are accepted (optional, defaults to thin)'], ['monitor-db-quorum', 'disable controller when only one host is online (optional, defaults to true)'] @@ -353,7 +351,6 @@ def activate_lvm_group(group_name): # Usage example: # xe sr-create type=linstor name-label=linstor-sr # host-uuid=d2deba7a-c5ad-4de1-9a20-5c8df3343e93 -# device-config:hosts=node-linstor1,node-linstor2,node-linstor3 # device-config:group-name=vg_loop device-config:redundancy=2 @@ -385,8 +382,6 @@ def load(self, sr_uuid): ) # Check parameters. 
- if 'hosts' not in self.dconf or not self.dconf['hosts']: - raise xs_errors.XenError('LinstorConfigHostsMissing') if 'group-name' not in self.dconf or not self.dconf['group-name']: raise xs_errors.XenError('LinstorConfigGroupNameMissing') if 'redundancy' not in self.dconf or not self.dconf['redundancy']: @@ -431,12 +426,6 @@ def load(self, sr_uuid): self.lock = Lock(vhdutil.LOCK_TYPE_SR, self.uuid) self.sr_vditype = SR.DEFAULT_TAP - self._hosts = list(set(self.dconf['hosts'].split(','))) - if 'ips' not in self.dconf or not self.dconf['ips']: - self._ips = None - else: - self._ips = self.dconf['ips'].split(',') - if self.cmd == 'sr_create': self._redundancy = int(self.dconf['redundancy']) or 1 self._linstor = None # Ensure that LINSTOR attribute exists. @@ -647,7 +636,8 @@ def cleanup(self): def create(self, uuid, size): util.SMlog('LinstorSR.create for {}'.format(self.uuid)) - if self._redundancy > len(self._hosts): + host_adresses = util.get_host_addresses(self.session) + if self._redundancy > len(host_adresses): raise xs_errors.XenError( 'LinstorSRCreate', opterr='Redundancy greater than host count' @@ -676,39 +666,17 @@ def create(self, uuid, size): ) online_hosts = util.get_online_hosts(self.session) - if len(online_hosts) < len(self._hosts): + if len(online_hosts) < len(host_adresses): raise xs_errors.XenError( 'LinstorSRCreate', opterr='Not enough online hosts' ) ips = {} - if not self._ips: - for host in online_hosts: - record = self.session.xenapi.host.get_record(host) - hostname = record['hostname'] - if hostname in self._hosts: - ips[hostname] = record['address'] - elif len(self._ips) != len(self._hosts): - raise xs_errors.XenError( - 'LinstorSRCreate', - opterr='ips must be equal to host count' - ) - else: - for host in online_hosts: - record = self.session.xenapi.host.get_record(host) - hostname = record['hostname'] - try: - index = self._hosts.index(hostname) - ips[hostname] = self._ips[index] - except ValueError as e: - pass - - if len(ips) != 
len(self._hosts): - raise xs_errors.XenError( - 'LinstorSRCreate', - opterr='Not enough online hosts' - ) + for host_ref in online_hosts: + record = self.session.xenapi.host.get_record(host_ref) + hostname = record['hostname'] + ips[hostname] = record['address'] # Ensure ports are opened and LINSTOR satellites # are activated. In the same time the drbd-reactor instances @@ -720,7 +688,6 @@ def create(self, uuid, size): try: self._linstor = LinstorVolumeManager.create_sr( self._group_name, - self._hosts, ips, self._redundancy, thin_provisioning=self._provisioning == 'thin', diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 6f20c02c..464ab2ce 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -1588,14 +1588,13 @@ def get_resources_info(self): @classmethod def create_sr( - cls, group_name, node_names, ips, redundancy, + cls, group_name, ips, redundancy, thin_provisioning, auto_quorum, logger=default_logger.__func__ ): """ Create a new SR on the given nodes. :param str group_name: The SR group_name to use. - :param list[str] node_names: String list of nodes. :param set(str) ips: Node ips. :param int redundancy: How many copy of volumes should we store? :param bool thin_provisioning: Use thin or thick provisioning. @@ -1609,7 +1608,6 @@ def create_sr( cls._start_controller(start=True) sr = cls._create_sr( group_name, - node_names, ips, redundancy, thin_provisioning, @@ -1630,7 +1628,7 @@ def create_sr( @classmethod def _create_sr( - cls, group_name, node_names, ips, redundancy, + cls, group_name, ips, redundancy, thin_provisioning, auto_quorum, logger=default_logger.__func__ ): @@ -1639,9 +1637,8 @@ def _create_sr( lin = cls._create_linstor_instance(uri, keep_uri_unmodified=True) - for node_name in node_names: - ip = ips[node_name] - + node_names = ips.keys() + for node_name, ip in ips.iteritems(): while True: # Try to create node. 
result = lin.node_create( From 160a10fd95846f8c3967de4defcd2380ca951f82 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 1 Jun 2023 17:40:37 +0200 Subject: [PATCH 116/133] fix(LinstorSR): compute correctly SR size using pool count Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 4 +-- drivers/linstorvolumemanager.py | 45 +++++++++++++++++---------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 7a9cbac6..f6c43569 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1065,8 +1065,8 @@ def _update_stats(self, virt_alloc_delta): def _update_physical_size(self): # We use the size of the smallest disk, this is an approximation that # ensures the displayed physical size is reachable by the user. - self.physical_size = \ - self._linstor.min_physical_size * len(self._hosts) / \ + (min_physical_size, pool_count) = self._linstor.get_min_physical_size() + self.physical_size = min_physical_size * pool_count / \ self._linstor.redundancy self.physical_utilisation = self._linstor.allocated_volume_size diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 464ab2ce..ee637ae2 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -492,28 +492,6 @@ def physical_free_size(self): """ return self._compute_size('free_capacity') - @property - def min_physical_size(self): - """ - Give the minimum physical size of the SR. - I.e. the size of the smallest disk. - :return: The physical min size. 
- :rtype: int - """ - size = None - for pool in self._get_storage_pools(force=True): - space = pool.free_space - if space: - current_size = space.total_capacity - if current_size < 0: - raise LinstorVolumeManagerError( - 'Failed to get pool total_capacity attr of `{}`' - .format(pool.node_name) - ) - if size is None or current_size < size: - size = current_size - return (size or 0) * 1024 - @property def allocated_volume_size(self): """ @@ -554,6 +532,29 @@ def allocated_volume_size(self): return total_size * 1024 + def get_min_physical_size(self): + """ + Give the minimum physical size of the SR. + I.e. the size of the smallest disk + the number of pools. + :return: The physical min size. + :rtype: tuple(int, int) + """ + size = None + pool_count = 0 + for pool in self._get_storage_pools(force=True): + space = pool.free_space + if space: + pool_count += 1 + current_size = space.total_capacity + if current_size < 0: + raise LinstorVolumeManagerError( + 'Failed to get pool total_capacity attr of `{}`' + .format(pool.node_name) + ) + if size is None or current_size < size: + size = current_size + return (pool_count, (size or 0) * 1024) + @property def metadata(self): """ From d414469e2c9a3973148802817db2bbee78b2d2ca Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 6 Jun 2023 11:50:54 +0200 Subject: [PATCH 117/133] fix(blktap2): ensure we can import this module when LINSTOR is not installed Signed-off-by: Ronan Abhamon --- drivers/blktap2.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/blktap2.py b/drivers/blktap2.py index 26ab689a..ea19cca1 100755 --- a/drivers/blktap2.py +++ b/drivers/blktap2.py @@ -36,7 +36,6 @@ import xs_errors import XenAPI import scsiutil -from linstorvolumemanager import log_drbd_openers from syslog import openlog, syslog from stat import * # S_ISBLK(), ... 
import nfs @@ -51,6 +50,12 @@ from socket import socket, AF_UNIX, SOCK_STREAM from httplib import HTTP, HTTPConnection +try: + from linstorvolumemanager import log_drbd_openers + LINSTOR_AVAILABLE = True +except ImportError: + LINSTOR_AVAILABLE = False + PLUGIN_TAP_PAUSE = "tapdisk-pause" SOCKPATH = "/var/xapi/xcp-rrdd" @@ -831,7 +836,7 @@ def launch_on_tap(cls, blktap, path, _type, options): retry_open += 1 time.sleep(1) continue - if err == errno.EROFS: + if LINSTOR_AVAILABLE and err == errno.EROFS: log_drbd_openers(path) break try: From 57dcd41ec7a0db66e551a28f2e47bba10fa8a69e Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 21 Jun 2023 14:10:18 +0200 Subject: [PATCH 118/133] fix(LinstorSR): ensure volume cache can be recreated After SR creation we may fail to load volumes with this exception: "Failed to get usable size of..." and so we can't plug the master PBD. Regardless of the retry timeout, the only solution to fetch the usable size of the DB is to recreate the connection to the LINSTOR API. Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 64 +++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index f6c43569..0bccc167 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -523,28 +523,7 @@ def connect(): if self.srcmd.cmd != 'sr_create' and self.srcmd.cmd != 'sr_detach': try: - controller_uri = get_controller_uri() - - self._journaler = LinstorJournaler( - controller_uri, self._group_name, logger=util.SMlog - ) - - # Try to open SR if exists. - # We can repair only if we are on the master AND if - # we are trying to execute an exclusive operation. - # Otherwise we could try to delete a VDI being created or - # during a snapshot. An exclusive op is the guarantee that - # the SR is locked. 
- self._linstor = LinstorVolumeManager( - controller_uri, - self._group_name, - repair=( - self._is_master and - self.srcmd.cmd in self.ops_exclusive - ), - logger=util.SMlog - ) - self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + self._reconnect() except Exception as e: raise xs_errors.XenError('SRUnavailable', opterr=str(e)) @@ -1518,12 +1497,25 @@ def _undo_clone(self, volume_names, vdi_uuid, base_uuid, snap_uuid): # -------------------------------------------------------------------------- def _create_linstor_cache(self): + # TODO: use a nonlocal with python3. + class context: + reconnect = False + + def create_cache(): + try: + if context.reconnect: + self._reconnect() + return self._linstor.get_volumes_with_info() + except Exception as e: + context.reconnect = True + raise e + self._all_volume_metadata_cache = \ self._linstor.get_volumes_with_metadata() self._all_volume_info_cache = util.retry( - self._linstor.get_volumes_with_info, + create_cache, maxretry=10, - period=1 + period=3 ) def _destroy_linstor_cache(self): @@ -1534,6 +1526,30 @@ def _destroy_linstor_cache(self): # Misc. # -------------------------------------------------------------------------- + def _reconnect(self): + controller_uri = get_controller_uri() + + self._journaler = LinstorJournaler( + controller_uri, self._group_name, logger=util.SMlog + ) + + # Try to open SR if exists. + # We can repair only if we are on the master AND if + # we are trying to execute an exclusive operation. + # Otherwise we could try to delete a VDI being created or + # during a snapshot. An exclusive op is the guarantee that + # the SR is locked. 
+ self._linstor = LinstorVolumeManager( + controller_uri, + self._group_name, + repair=( + self._is_master and + self.srcmd.cmd in self.ops_exclusive + ), + logger=util.SMlog + ) + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) + def _ensure_space_available(self, amount_needed): space_available = self._linstor.max_volume_size_allowed if (space_available < amount_needed): From 17e72b610a529f65fe24e2f4fc6775e8237bd261 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 20 Jul 2023 10:46:33 +0200 Subject: [PATCH 119/133] fix(linstor-manager): remove dead/useless code in add/remove_host helpers Signed-off-by: Ronan Abhamon --- drivers/linstor-manager | 128 +++++----------------------------------- 1 file changed, 15 insertions(+), 113 deletions(-) diff --git a/drivers/linstor-manager b/drivers/linstor-manager index 9e96aaca..45201eed 100755 --- a/drivers/linstor-manager +++ b/drivers/linstor-manager @@ -555,7 +555,7 @@ def has_controller_running(session, args): def add_host(session, args): group_name = args['groupName'] - # 1. Find SR and PBDs. + # 1. Find all LINSTOR SRs and PBDs. srs = dict() for sr_ref, sr in session.xenapi.SR.get_all_records().items(): if sr.get('type') == 'linstor': @@ -597,7 +597,6 @@ def add_host(session, args): node_name = socket.gethostname() has_node = linstor.has_node(node_name) - pbd_id = 0 new_pbd_ref = None try: @@ -610,66 +609,30 @@ def add_host(session, args): if not has_node: linstor.create_node(node_name, util.get_this_host_address(session)) - # 6. Recreate PBDs. - # Use the redundancy given by Linstor instead of smapi config. - redundancy = linstor.redundancy - default_device_config = None + # 6. Try to create PBD. 
this_host = util.get_this_host_ref(session) create_new_pbd = True assert pbds - pbds = pbds.items() - for pbd_ref, pbd in pbds: - device_config = pbd['device_config'] - - hosts = filter( - lambda host: len(host.strip()), - device_config.get('hosts', []).split(',') - ) - hosts.append(node_name) - hosts = ','.join(list(set(hosts))) + for pbd in pbds.values(): + if pbd['host'] == this_host: + create_new_pbd = False + break + device_config = pbd['device_config'] # Should be the same on all hosts. provisioning = device_config['provisioning'] - if not default_device_config: - default_device_config = { - 'group-name': group_name, - 'redundancy': redundancy, - 'hosts': hosts, - 'provisioning': provisioning - } - - if pbd['currently_attached']: - session.xenapi.PBD.unplug(pbd_ref) - session.xenapi.PBD.destroy(pbd_ref) - pbd_id += 1 - - host = pbd['host'] - if host == this_host: - create_new_pbd = False - - pbd_ref = session.xenapi.PBD.create({ - 'host': host, - 'SR': sr_ref, - 'device_config': { - 'group-name': group_name, - 'redundancy': redundancy, - 'hosts': hosts, - 'provisioning': provisioning - } - }) - try: - session.xenapi.PBD.plug(pbd_ref) - except Exception as e: - util.SMlog('Failed to replug PBD: {}'.format(e)) - # 7. Create new PBD. 
if create_new_pbd: new_pbd_ref = session.xenapi.PBD.create({ 'host': this_host, 'SR': sr_ref, - 'device_config': default_device_config + 'device_config': { + 'group-name': group_name, + 'redundancy': linstor.redundancy, + 'provisioning': provisioning + } }) try: session.xenapi.PBD.plug(new_pbd_ref) @@ -685,38 +648,6 @@ def add_host(session, args): except Exception: pass - for pbd_ref, pbd in pbds[:pbd_id]: - try: - session.xenapi.PBD.unplug(pbd_ref) - except Exception: - pass - - try: - session.xenapi.PBD.destroy(pbd_ref) - except Exception: - pass - - try: - device_config = pbd['device_config'] - session.xenapi.PBD.create({ - 'host': host, - 'SR': sr_ref, - 'device_config': { - 'group-name': group_name, - 'redundancy': redundancy, - 'hosts': device_config['hosts'], - 'provisioning': device_config['provisioning'] - } - }) - except Exception as pbd_error: - util.SMlog('Failed to recreate PBD: {}'.format(pbd_error)) - pass - - try: - session.xenapi.PBD.plug(pbd_ref) - except Exception: - pass - if new_pbd_ref: try: session.xenapi.PBD.unplug(new_pbd_ref) @@ -743,7 +674,7 @@ def add_host(session, args): def remove_host(session, args): group_name = args['groupName'] - # 1. Find SRs and PBDs. + # 1. Find all LINSTOR SRs and PBDs. srs = dict() for sr_ref, sr in session.xenapi.SR.get_all_records().items(): if sr.get('type') == 'linstor': @@ -772,45 +703,16 @@ def remove_host(session, args): if linstor.has_node(node_name): raise Exception('Failed to remove node! Unknown error.') - redundancy = linstor.redundancy this_host = util.get_this_host_ref(session) - # 3. Update PBDs. + # 3. Remove PBD. 
for pbd_ref, pbd in pbds.items(): host = pbd['host'] if host == this_host: if pbd['currently_attached']: session.xenapi.PBD.unplug(pbd_ref) session.xenapi.PBD.destroy(pbd_ref) - continue - - device_config = pbd['device_config'] - hosts = device_config.get('hosts', []).split(',') - try: - hosts.remove(node_name) - except Exception as e: - continue - hosts = ','.join(list(set(hosts))) - - if pbd['currently_attached']: - session.xenapi.PBD.unplug(pbd_ref) - session.xenapi.PBD.destroy(pbd_ref) - - pbd_ref = session.xenapi.PBD.create({ - 'host': host, - 'SR': pbd['SR'], - 'device_config': { - 'group-name': group_name, - 'redundancy': redundancy, - 'hosts': hosts, - 'provisioning': device_config['provisioning'] - } - }) - - try: - session.xenapi.PBD.plug(pbd_ref) - except Exception as e: - util.SMlog('Failed to replug PBD: {}'.format(e)) + break # 3. Stop services. try: From d5044703c34c31524291bad6eae25afeb1d71915 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 1 Aug 2023 15:16:17 +0200 Subject: [PATCH 120/133] fix(LinstorSR): Ensure we always have a device path during leaf-coalesce calls So we must not verify that we have a valid DRBD path in the load step, it can fail on many hosts, instead we must create a diskless path only during the real coalesce. Note: I removed this assert: `assert virtual_size >= volume_size`, it seems that it's not always true, I suppose the volume size can be greater than expected due to a bigger allocation in the LVM or DRBD layer. 
Signed-off-by: Ronan Abhamon --- drivers/cleanup.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index bf0cc46e..8f40d45e 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -1394,11 +1394,6 @@ def load(self, info=None): self.fileName = self.sr._linstor.get_volume_name(self.uuid) self.path = self.sr._linstor.build_device_path(self.fileName) - if not util.pathexists(self.path): - raise util.SMException( - '{} of {} not found' - .format(self.fileName, self.uuid) - ) if not info: try: @@ -3020,8 +3015,6 @@ def _calcExtraSpaceNeeded(self, child, parent): parent.sizeVirt + meta_overhead + bitmap_overhead ) volume_size = self._linstor.get_volume_size(parent.uuid) - - assert virtual_size >= volume_size return virtual_size - volume_size def _hasValidDevicePath(self, uuid): @@ -3042,6 +3035,11 @@ def _liveLeafCoalesce(self, vdi): finally: self.unlock() + def _prepareCoalesceLeaf(self, vdi): + # Move diskless path if necessary. We must have an access + # to modify locally the volume. + self._linstor.get_device_path(vdi.uuid) + def _handleInterruptedCoalesceLeaf(self): entries = self.journaler.get_all(VDI.JRN_LEAF) for uuid, parentUuid in entries.iteritems(): From 0cd226ebfe91d85f87f95fc87782cbc10b29cefc Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 16 Aug 2023 12:04:01 +0200 Subject: [PATCH 121/133] fix(LinstorSR): always use lock.acquire() during attach/detach We can't use a retry range on the lock because we can trigger a bad situation in the detach step... 
When the GC has a lock on the SR and we try to acquire the same lock in a detach call, and if we can't get this lock after 20 seconds, the consequences are very bad: - Many tapdisk instances of the same VDI can be created on two hosts - The VDI info is not updated correctly - And this issue is not immediately visible Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 0bccc167..98919a48 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -142,14 +142,6 @@ def compute_volume_size(virtual_size, image_type): return LinstorVolumeManager.round_up_volume_size(virtual_size) -def try_lock(lock): - for i in range(20): - if lock.acquireNoblock(): - return - time.sleep(1) - raise util.SRBusyException() - - def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): volume_metadata = linstor.get_volume_metadata(vdi_uuid) image_type = volume_metadata.get(VDI_TYPE_TAG) @@ -158,7 +150,7 @@ def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) try: - try_lock(lock) + lock.acquire() device_path = linstor.get_device_path(vdi_uuid) @@ -191,7 +183,7 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) try: - try_lock(lock) + lock.acquire() vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) vbds = session.xenapi.VBD.get_all_records_where( From 86766d554a4b5a76fe1413635c991e7a92a41c09 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 17 Aug 2023 14:52:13 +0200 Subject: [PATCH 122/133] fix(LinstorSR): make sure hostnames are unique at SR creation Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 98919a48..e512487c 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -649,6 +649,12 @@ def create(self, uuid, size): 
hostname = record['hostname'] ips[hostname] = record['address'] + if len(ips) != len(online_hosts): + raise xs_errors.XenError( + 'LinstorSRCreate', + opterr='Multiple hosts with same hostname' + ) + # Ensure ports are opened and LINSTOR satellites # are activated. In the same time the drbd-reactor instances # must be stopped. From 9c2716172354442aafbc9fc4627bf976f6bba250 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Fri, 18 Aug 2023 11:06:56 +0200 Subject: [PATCH 123/133] fix(LinstorSR): ensure we can attach non-special static VDIs Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index e512487c..17083703 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -461,6 +461,9 @@ def create_linstor(uri, attempt_count=30): logger=util.SMlog, attempt_count=attempt_count ) + # Only required if we are attaching from config using a non-special VDI. + # I.e. not an HA volume. + self._vhdutil = LinstorVhdUtil(self.session, self._linstor) controller_uri = get_controller_uri() if controller_uri: From f534d4a2dfd31a8b5b4c36b09fbd128189bd8574 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 31 Aug 2023 18:00:27 +0200 Subject: [PATCH 124/133] fix(LinstorSR): ensure we can detach when deflate call is not possible Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 45 +++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 17083703..a6ca8840 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -175,7 +175,7 @@ def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): lock.release() -def detach_thin(session, linstor, sr_uuid, vdi_uuid): +def detach_thin_impl(session, linstor, sr_uuid, vdi_uuid): volume_metadata = linstor.get_volume_metadata(vdi_uuid) image_type = volume_metadata.get(VDI_TYPE_TAG) if image_type == vhdutil.VDI_TYPE_RAW: @@ 
-185,21 +185,26 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): try: lock.acquire() - vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) - vbds = session.xenapi.VBD.get_all_records_where( - 'field "VDI" = "{}"'.format(vdi_ref) - ) + def check_vbd_count(): + vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) + vbds = session.xenapi.VBD.get_all_records_where( + 'field "VDI" = "{}"'.format(vdi_ref) + ) - num_plugged = 0 - for vbd_rec in vbds.values(): - if vbd_rec['currently_attached']: - num_plugged += 1 - if num_plugged > 1: - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Cannot deflate VDI {}, already used by ' - 'at least 2 VBDs'.format(vdi_uuid) - ) + num_plugged = 0 + for vbd_rec in vbds.values(): + if vbd_rec['currently_attached']: + num_plugged += 1 + if num_plugged > 1: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot deflate VDI {}, already used by ' + 'at least 2 VBDs'.format(vdi_uuid) + ) + + # We can have multiple VBDs attached to a VDI during a VM-template clone. + # So we use a timeout to ensure that we can detach the volume properly. + util.retry(check_vbd_count, maxretry=10, period=1) device_path = linstor.get_device_path(vdi_uuid) new_volume_size = LinstorVolumeManager.round_up_volume_size( @@ -217,6 +222,16 @@ def detach_thin(session, linstor, sr_uuid, vdi_uuid): lock.release() +def detach_thin(session, linstor, sr_uuid, vdi_uuid): + # This function must always return without errors. + # Otherwise it could cause errors in the XAPI regarding the state of the VDI. + # It's why we use this `try` block. + try: + detach_thin_impl(session, linstor, sr_uuid, vdi_uuid) + except Exception as e: + util.SMlog('Failed to detach properly VDI {}: {}'.format(vdi_uuid, e)) + + def inflate(journaler, linstor, vdi_uuid, vdi_path, new_size, old_size): # Only inflate if the LINSTOR volume capacity is not enough. 
new_size = LinstorVolumeManager.round_up_volume_size(new_size) From f0535fda8ffc2dc99e0d01174b6ca454caa7f8fc Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Thu, 28 Sep 2023 16:00:02 +0200 Subject: [PATCH 125/133] fix(LinstorSR): assume VDI is always a VHD when the info is missing during cleanup Signed-off-by: Ronan Abhamon --- drivers/cleanup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index 8f40d45e..be61bfec 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2990,11 +2990,11 @@ def _load_vdi_info(self): not list(volumes_metadata[vdi_uuid].items()): continue # Ignore it, probably deleted. - vdi_type = volumes_metadata[vdi_uuid][VDI_TYPE_TAG] - if vdi_type == vhdutil.VDI_TYPE_VHD: - info = self._vhdutil.get_vhd_info(vdi_uuid) - else: + vdi_type = volumes_metadata[vdi_uuid].get(VDI_TYPE_TAG) + if vdi_type == vhdutil.VDI_TYPE_RAW: info = None + else: + info = self._vhdutil.get_vhd_info(vdi_uuid) except Exception as e: Util.log( ' [VDI {}: failed to load VDI info]: {}' From b2bef4a41ccd46d2f7f6266333d07ce40977dc8a Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 2 Oct 2023 16:48:49 +0200 Subject: [PATCH 126/133] fix(LinstorSR): remove SR lock during thin attach/detach This lock is normally useless and can create a deadlock when thin mode is activated: - A task tries to deactivate a volume during a VM shutdown on a slave (so a VDI A is locked). Then a new task is created on the master host, we try to get the SR lock on the master. - In parallel a tap-pause is asked from the master to the slave, the master SR lock is now locked. The tap-pause request is received on the slave, but we can't lock VDI A because it's already locked. To summarize: a deadlock is only possible if we try to shutdown a VM with a particular VDI and if we try to snapshot it at the same time. 
Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 102 +++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index a6ca8840..ed41e77a 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -148,32 +148,26 @@ def attach_thin(session, journaler, linstor, sr_uuid, vdi_uuid): if image_type == vhdutil.VDI_TYPE_RAW: return - lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) - try: - lock.acquire() + device_path = linstor.get_device_path(vdi_uuid) + + # If the virtual VHD size is lower than the LINSTOR volume size, + # there is nothing to do. + vhd_size = compute_volume_size( + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), # pylint: disable = E1120 + image_type + ) - device_path = linstor.get_device_path(vdi_uuid) + volume_info = linstor.get_volume_info(vdi_uuid) + volume_size = volume_info.virtual_size - # If the virtual VHD size is lower than the LINSTOR volume size, - # there is nothing to do. 
- vhd_size = compute_volume_size( - # TODO: Replace pylint comment with this feature when possible: - # https://github.com/PyCQA/pylint/pull/2926 - LinstorVhdUtil(session, linstor).get_size_virt(vdi_uuid), # pylint: disable = E1120 - image_type + if vhd_size > volume_size: + inflate( + journaler, linstor, vdi_uuid, device_path, + vhd_size, volume_size ) - volume_info = linstor.get_volume_info(vdi_uuid) - volume_size = volume_info.virtual_size - - if vhd_size > volume_size: - inflate( - journaler, linstor, vdi_uuid, device_path, - vhd_size, volume_size - ) - finally: - lock.release() - def detach_thin_impl(session, linstor, sr_uuid, vdi_uuid): volume_metadata = linstor.get_volume_metadata(vdi_uuid) @@ -181,45 +175,39 @@ def detach_thin_impl(session, linstor, sr_uuid, vdi_uuid): if image_type == vhdutil.VDI_TYPE_RAW: return - lock = Lock(vhdutil.LOCK_TYPE_SR, sr_uuid) - try: - lock.acquire() - - def check_vbd_count(): - vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) - vbds = session.xenapi.VBD.get_all_records_where( - 'field "VDI" = "{}"'.format(vdi_ref) - ) + def check_vbd_count(): + vdi_ref = session.xenapi.VDI.get_by_uuid(vdi_uuid) + vbds = session.xenapi.VBD.get_all_records_where( + 'field "VDI" = "{}"'.format(vdi_ref) + ) - num_plugged = 0 - for vbd_rec in vbds.values(): - if vbd_rec['currently_attached']: - num_plugged += 1 - if num_plugged > 1: - raise xs_errors.XenError( - 'VDIUnavailable', - opterr='Cannot deflate VDI {}, already used by ' - 'at least 2 VBDs'.format(vdi_uuid) - ) + num_plugged = 0 + for vbd_rec in vbds.values(): + if vbd_rec['currently_attached']: + num_plugged += 1 + if num_plugged > 1: + raise xs_errors.XenError( + 'VDIUnavailable', + opterr='Cannot deflate VDI {}, already used by ' + 'at least 2 VBDs'.format(vdi_uuid) + ) - # We can have multiple VBDs attached to a VDI during a VM-template clone. - # So we use a timeout to ensure that we can detach the volume properly. 
- util.retry(check_vbd_count, maxretry=10, period=1) + # We can have multiple VBDs attached to a VDI during a VM-template clone. + # So we use a timeout to ensure that we can detach the volume properly. + util.retry(check_vbd_count, maxretry=10, period=1) - device_path = linstor.get_device_path(vdi_uuid) - new_volume_size = LinstorVolumeManager.round_up_volume_size( - # TODO: Replace pylint comment with this feature when possible: - # https://github.com/PyCQA/pylint/pull/2926 - LinstorVhdUtil(session, linstor).get_size_phys(vdi_uuid) # pylint: disable = E1120 - ) + device_path = linstor.get_device_path(vdi_uuid) + new_volume_size = LinstorVolumeManager.round_up_volume_size( + # TODO: Replace pylint comment with this feature when possible: + # https://github.com/PyCQA/pylint/pull/2926 + LinstorVhdUtil(session, linstor).get_size_phys(vdi_uuid) # pylint: disable = E1120 + ) - volume_info = linstor.get_volume_info(vdi_uuid) - old_volume_size = volume_info.virtual_size - deflate( - linstor, vdi_uuid, device_path, new_volume_size, old_volume_size - ) - finally: - lock.release() + volume_info = linstor.get_volume_info(vdi_uuid) + old_volume_size = volume_info.virtual_size + deflate( + linstor, vdi_uuid, device_path, new_volume_size, old_volume_size + ) def detach_thin(session, linstor, sr_uuid, vdi_uuid): From 51867a0711f6985a60ab11772110b1bc0304a860 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 3 Oct 2023 18:42:42 +0200 Subject: [PATCH 127/133] fix(LinstorSR): ensure database is mounted during scan Signed-off-by: Ronan Abhamon --- drivers/LinstorSR.py | 15 +++++++++++++++ drivers/linstorvolumemanager.py | 10 +++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index ed41e77a..ed5998e8 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -824,6 +824,21 @@ def scan(self, uuid): if self.vdis[vdi_uuid].deleted: del self.vdis[vdi_uuid] + # Security to prevent VDIs from being forgotten if the 
controller + # is started without a shared and mounted /var/lib/linstor path. + try: + self._linstor.get_database_path() + except Exception: + # Failed to get database path, ensure we don't have + # VDIs in the XAPI database... + if self.session.xenapi.SR.get_VDIs( + self.session.xenapi.SR.get_by_uuid(self.uuid) + ): + raise xs_errors.XenError( + 'SRUnavailable', + opterr='Database is not mounted' + ) + # Update the database before the restart of the GC to avoid # bad sync in the process if new VDIs have been introduced. ret = super(LinstorSR, self).scan(self.uuid) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index ee637ae2..f1f3bce7 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -884,7 +884,7 @@ def get_volume_info(self, volume_uuid): def get_device_path(self, volume_uuid): """ - Get the dev path of a volume. + Get the dev path of a volume, create a diskless if necessary. :param str volume_uuid: The volume uuid to get the dev path. :return: The current device path of the volume. :rtype: str @@ -1587,6 +1587,14 @@ def get_resources_info(self): return resources + def get_database_path(self): + """ + Get the database path. + :return: The current database path. + :rtype: str + """ + return self._request_database_path(self._linstor) + @classmethod def create_sr( cls, group_name, ips, redundancy, From 2b83f39dbefc675b7a05d4c8a4b66c82d474f250 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 4 Oct 2023 14:30:36 +0200 Subject: [PATCH 128/133] fix(LinstorSR): restart drbd-reactor in case of failure Otherwise we can have all hosts unusable after a massive reboot: all the drbd-reactor services are marked as failed, and no controller is started. 
Signed-off-by: Ronan Abhamon --- Makefile | 3 +++ etc/systemd/system/drbd-reactor.service.d/override.conf | 7 +++++++ 2 files changed, 10 insertions(+) create mode 100644 etc/systemd/system/drbd-reactor.service.d/override.conf diff --git a/Makefile b/Makefile index f8196cb1..42cdfa93 100755 --- a/Makefile +++ b/Makefile @@ -148,6 +148,7 @@ install: precheck mkdir -p $(SM_STAGING)$(UDEV_SCRIPTS_DIR) mkdir -p $(SM_STAGING)$(INIT_DIR) mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR) + mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR)/drbd-reactor.service.d mkdir -p $(SM_STAGING)$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d mkdir -p $(SM_STAGING)$(SYSTEMD_SERVICE_DIR) mkdir -p $(SM_STAGING)$(MPATH_CONF_DIR) @@ -177,6 +178,8 @@ install: precheck $(SM_STAGING)/$(SM_DEST) install -m 644 etc/logrotate.d/$(SMLOG_CONF) \ $(SM_STAGING)/$(LOGROTATE_DIR) + install -m 644 etc/systemd/system/drbd-reactor.service.d/override.conf \ + $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/drbd-reactor.service.d/ install -m 644 etc/systemd/system/linstor-satellite.service.d/override.conf \ $(SM_STAGING)/$(SYSTEMD_CONF_DIR)/linstor-satellite.service.d/ install -m 644 etc/systemd/system/var-lib-linstor.service \ diff --git a/etc/systemd/system/drbd-reactor.service.d/override.conf b/etc/systemd/system/drbd-reactor.service.d/override.conf new file mode 100644 index 00000000..c079ab62 --- /dev/null +++ b/etc/systemd/system/drbd-reactor.service.d/override.conf @@ -0,0 +1,7 @@ +[Unit] +StartLimitInterval=60 +StartLimitBurst=10 + +[Service] +Restart=always +RestartSec=2 From e1c71683008f2d2008638a2ce05f6a344d2a77f1 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Mon, 9 Oct 2023 10:37:32 +0200 Subject: [PATCH 129/133] fix(linstorvolumemanager): retry in case of failure during mkfs call on database The device is not always ready after creation. So we must retry the mkfs call in case of failure. 
Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index f1f3bce7..23e80d91 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -2666,7 +2666,10 @@ def _create_database_volume( ) try: - util.pread2([DATABASE_MKFS, expected_device_path]) + util.retry( + lambda: util.pread2([DATABASE_MKFS, expected_device_path]), + maxretry=5 + ) except Exception as e: raise LinstorVolumeManagerError( 'Failed to execute {} on database volume: {}' From e72530a756b249368e0585b4a63af3d5e2cf1d09 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Tue, 26 Sep 2023 11:48:38 +0200 Subject: [PATCH 130/133] fix(linstorvolumemanager): avoid diskless creation when a new resource is added Like said in this discussion https://github.com/xcp-ng/sm/pull/34 : "Instead of using diskless_on_remaing, only place a resource on demand on a node. If a cluster would ever exceed 20 nodes, having diskless on all of them might get problematic, as all of them are part of the quorum voting." Signed-off-by: Ronan Abhamon --- drivers/linstorvolumemanager.py | 51 ++++++--------------------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 23e80d91..49ca83c0 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -614,8 +614,7 @@ def check_volume_exists(self, volume_uuid): return volume_uuid in self._volumes def create_volume( - self, volume_uuid, size, persistent=True, volume_name=None, - no_diskless=False + self, volume_uuid, size, persistent=True, volume_name=None ): """ Create a new volume on the SR. @@ -625,8 +624,6 @@ def create_volume( on the next constructor call LinstorSR(...). :param str volume_name: If set, this name is used in the LINSTOR database instead of a generated name. 
- :param bool no_diskless: If set, the default group redundancy is not - used, instead the volume is created on all nodes. :return: The current device path of the volume. :rtype: str """ @@ -635,8 +632,7 @@ def create_volume( if not volume_name: volume_name = self.build_volume_name(util.gen_uuid()) volume_properties = self._create_volume_with_properties( - volume_uuid, volume_name, size, place_resources=True, - no_diskless=no_diskless + volume_uuid, volume_name, size, place_resources=True ) # Volume created! Now try to find the device path. @@ -1295,8 +1291,7 @@ def create(): # Note: placed outside try/except block because we create only definition first. # There is no reason to call `clean` before the real resource creation. volume_properties = self._create_volume_with_properties( - clone_uuid, clone_volume_name, size, - place_resources=False + clone_uuid, clone_volume_name, size, place_resources=False ) # After this point, `clean` can be called for any fail because the clone UUID @@ -1758,7 +1753,7 @@ def _create_sr( name=group_name, place_count=redundancy, storage_pool=group_name, - diskless_on_remaining=True + diskless_on_remaining=False ) error_str = cls._get_error_str(result) if error_str: @@ -2062,28 +2057,11 @@ def _get_storage_pools(self, force=False): return self._storage_pools def _create_volume( - self, volume_uuid, volume_name, size, place_resources, - no_diskless=False + self, volume_uuid, volume_name, size, place_resources ): - if no_diskless and not place_resources: - raise LinstorVolumeManagerError( - 'Could not create volume `{}` from SR `{}`: it\'s impossible ' - .format(volume_uuid, self._group_name) + - 'to force no diskless without placing resources' - ) - size = self.round_up_volume_size(size) self._mark_resource_cache_as_dirty() - resources = [] - if no_diskless: - for node_name in self._get_node_names(): - resources.append(linstor.ResourceData( - node_name=node_name, - rsc_name=volume_name, - storage_pool=self._group_name - )) - def 
create_definition(): self._check_volume_creation_errors( self._linstor.resource_group_spawn( @@ -2109,23 +2087,13 @@ def clean(): def create(): try: create_definition() - if no_diskless: - # Create a physical resource on each node. - result = self._linstor.resource_create(resources) - error_str = self._get_error_str(result) - if error_str: - raise LinstorVolumeManagerError( - 'Could not create volume `{}` from SR `{}`: {}'.format( - volume_uuid, self._group_name, error_str - ) - ) - elif place_resources: + if place_resources: # Basic case when we use the default redundancy of the group. self._check_volume_creation_errors( self._linstor.resource_auto_place( rsc_name=volume_name, place_count=self._redundancy, - diskless_on_remaining=not no_diskless + diskless_on_remaining=False ), volume_uuid, self._group_name @@ -2141,8 +2109,7 @@ def create(): util.retry(create, maxretry=5) def _create_volume_with_properties( - self, volume_uuid, volume_name, size, place_resources, - no_diskless=False + self, volume_uuid, volume_name, size, place_resources ): if self.check_volume_exists(volume_uuid): raise LinstorVolumeManagerError( @@ -2171,7 +2138,7 @@ def _create_volume_with_properties( volume_properties[self.PROP_VOLUME_NAME] = volume_name self._create_volume( - volume_uuid, volume_name, size, place_resources, no_diskless + volume_uuid, volume_name, size, place_resources ) assert volume_properties.namespace == \ From 7ac370254018450fcc3e27b36014d4b23e334c63 Mon Sep 17 00:00:00 2001 From: Rene Peinthor Date: Tue, 25 Jul 2023 11:19:39 +0200 Subject: [PATCH 131/133] fix(LinstorSR): remove diskless after VDI.detach calls Signed-off-by: Rene Peinthor Co-authored-by: Ronan Abhamon --- drivers/LinstorSR.py | 20 ++++++++++++++++++++ drivers/linstorvolumemanager.py | 26 +++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index ed5998e8..7b9a8e57 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py 
@@ -1943,6 +1943,26 @@ def detach(self, sr_uuid, vdi_uuid): .format(e) ) + # We remove only on slaves because the volume can be used by the GC. + if self.sr._is_master: + return + + while vdi_uuid: + try: + path = self._linstor.build_device_path(self._linstor.get_volume_name(vdi_uuid)) + parent_vdi_uuid = self.sr._vhdutil.get_vhd_info(vdi_uuid).parentUuid + except Exception: + break + + if util.pathexists(path): + try: + self._linstor.remove_volume_if_diskless(vdi_uuid) + except Exception as e: + # Ensure we can always detach properly. + # I don't want to corrupt the XAPI info. + util.SMlog('Failed to clean VDI {} during detach: {}'.format(vdi_uuid, e)) + vdi_uuid = parent_vdi_uuid + def resize(self, sr_uuid, vdi_uuid, size): util.SMlog('LinstorVDI.resize for {}'.format(self.uuid)) if not self.sr._is_master: diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 49ca83c0..50900d99 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -781,6 +781,28 @@ def ensure_volume_list_is_not_locked(self, volume_uuids, timeout=None): if waiting: self._logger('No volume locked now!') + def remove_volume_if_diskless(self, volume_uuid): + """ + Remove diskless path from local node. + :param str volume_uuid: The volume uuid to remove. + """ + + self._ensure_volume_exists(volume_uuid) + + volume_properties = self._get_volume_properties(volume_uuid) + volume_name = volume_properties.get(self.PROP_VOLUME_NAME) + + node_name = socket.gethostname() + result = self._linstor.resource_delete_if_diskless( + node_name=node_name, rsc_name=volume_name + ) + if not linstor.Linstor.all_api_responses_no_error(result): + raise LinstorVolumeManagerError( + 'Unable to delete diskless path of `{}` on node `{}`: {}' + .format(volume_name, node_name, ', '.join( + [str(x) for x in result])) + ) + def introduce_volume(self, volume_uuid): pass # TODO: Implement me.
@@ -2459,9 +2481,7 @@ def _configure_volume_peer_slots(cls, lin, volume_name): @classmethod def _activate_device_path(cls, lin, node_name, volume_name): - result = lin.resource_create([ - linstor.ResourceData(node_name, volume_name, diskless=True) - ]) + result = lin.resource_make_available(node_name, volume_name, diskful=True) if linstor.Linstor.all_api_responses_no_error(result): return errors = linstor.Linstor.filter_api_call_response_errors(result) From 70cf88ca9d7ad96058444229d6fb350fcdf84eb8 Mon Sep 17 00:00:00 2001 From: Ronan Abhamon Date: Wed, 11 Oct 2023 12:39:56 +0200 Subject: [PATCH 132/133] fix(LinstorSR): robustify _load_vdi_info in cleanup.py After a failed snapshot like that: ``` Sep 18 12:02:47 xcp1 SM: [909] ['/usr/bin/vhd-util', 'snapshot', '--debug', '-n', '/dev/drbd/by-res/xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d/0', '-p', '/dev/drbd/by-res/xcp-volume-432905e3-57ba-41c7-8517-3331e1907c69/0', '-S', '2097152'] Sep 18 12:02:49 xcp1 SM: [909] FAILED in util.pread: (rc 30) stdout: '', stderr: '/dev/drbd/by-res/xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d/0: failed to create: -30 Sep 18 12:02:50 xcp1 SM: [909] raise opener exception: cmd: `['/usr/bin/vhd-util', 'snapshot', '--debug', '-n', '/dev/drbd/by-res/xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d/0', '-p', '/dev/drbd/by-res/xcp-volume-432905e3-57ba-41c7-8517-3331e1907c69/0', '-S', '2097152']`, code: `30`, reason: `/dev/drbd/by-res/xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d/0: failed to create: -30` (openers: {'xcp3': {}, 'xcp2': {}, 'xcp1': {}}) Sep 18 12:02:50 xcp1 SM: [909] ***** Failed to snapshot!: EXCEPTION , cmd: `['/usr/bin/vhd-util', 'snapshot', '--debug', '-n', '/dev/drbd/by-res/xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d/0', '-p', '/dev/drbd/by-res/xcp-volume-432905e3-57ba-41c7-8517-3331e1907c69/0', '-S', '2097152']`, code: `30`, reason: `/dev/drbd/by-res/xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d/0: failed to create: -30` (openers: {'xcp3': {}, 'xcp2': {}, 
'xcp1': {}}) Sep 18 12:02:50 xcp1 SM: [909] Cannot destroy VDI 4e1ac2a2-3d57-408f-92a8-ccc03882511f during undo clone: Cannot destroy volume `4e1ac2a2-3d57-408f-92a8-ccc03882511f`: Could not destroy resource `xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d` from SR `xcp-sr-linstor_group_thin_device`: Resource 'xcp-volume-bbf05b73-bbad-438e-a194-3470558c8a8d' on node 'xcp2' is still in use. Sep 18 12:02:50 xcp1 SM: [909] Trying to update volume UUID 4e1ac2a2-3d57-408f-92a8-ccc03882511f to DELETED_4e1ac2a2-3d57-408f-92a8-ccc03882511f... ``` The remaining volume can be empty, so we must ignore all volumes with the `DELETED_` prefix. These are not valid VHDs after all. And also we must be sure to never set the RAW flag on corrupted VHDs during cleanup. Signed-off-by: Ronan Abhamon --- drivers/cleanup.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/cleanup.py b/drivers/cleanup.py index be61bfec..346180f5 100755 --- a/drivers/cleanup.py +++ b/drivers/cleanup.py @@ -2991,10 +2991,21 @@ def _load_vdi_info(self): continue # Ignore it, probably deleted. vdi_type = volumes_metadata[vdi_uuid].get(VDI_TYPE_TAG) - if vdi_type == vhdutil.VDI_TYPE_RAW: - info = None - else: + if vdi_type == vhdutil.VDI_TYPE_VHD: info = self._vhdutil.get_vhd_info(vdi_uuid) + elif not vdi_uuid.startswith('DELETED_'): + # Ensure it's not a VHD... + try: + info = self._vhdutil.get_vhd_info(vdi_uuid) + except: + try: + self.repair(vdi_uuid) + info = self._vhdutil.get_vhd_info(vdi_uuid) + except: + info = None + else: + # Assume it's really a RAW volume of a failed snap without VHD header/footer. 
+ info = None except Exception as e: Util.log( ' [VDI {}: failed to load VDI info]: {}' From 0e8cc8dccecb87511d96c446ca38cc5347fb41a0 Mon Sep 17 00:00:00 2001 From: BenjiReis Date: Fri, 20 Oct 2023 11:42:18 +0200 Subject: [PATCH 133/133] Increase DRBD Net/ping-timeout This would avoid fake dead node assumption Signed-off-by: BenjiReis --- drivers/LinstorSR.py | 4 ++++ drivers/linstorvolumemanager.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/LinstorSR.py b/drivers/LinstorSR.py index 7b9a8e57..d3c15b6c 100755 --- a/drivers/LinstorSR.py +++ b/drivers/LinstorSR.py @@ -1773,6 +1773,10 @@ def create(self, sr_uuid, vdi_uuid, size): if self.ty == 'ha_statefile' or self.ty == 'redo_log': self._linstor.set_auto_promote_timeout(self.uuid, 600) + # Increase `ping-timeout` parameter to ensure there is no failure in critical components like `tapdisk`. + # In fact a missed DRBD ACK packet causes EIO errors on `read/write` calls and completely blocks processes. + self._linstor.set_ping_timeout(self.uuid, 300) + self._linstor.mark_volume_as_persistent(self.uuid) except util.CommandException as e: failed = True diff --git a/drivers/linstorvolumemanager.py b/drivers/linstorvolumemanager.py index 50900d99..97815734 100755 --- a/drivers/linstorvolumemanager.py +++ b/drivers/linstorvolumemanager.py @@ -889,6 +889,23 @@ def set_auto_promote_timeout(self, volume_uuid, timeout): .format(volume_uuid, error_str) ) + def set_ping_timeout(self, volume_uuid, timeout): + """ + Set the response time to answer a DRBD ping packet. + :param str volume_uuid: The volume uuid to modify. 
+ """ + + volume_name = self.get_volume_name(volume_uuid) + result = self._linstor.resource_dfn_modify(volume_name, { + 'DrbdOptions/Net/ping-timeout': timeout + }) + error_str = self._get_error_str(result) + if error_str: + raise LinstorVolumeManagerError( + 'Could not change the ping timeout of `{}`: {}' + .format(volume_uuid, error_str) + ) + def get_volume_info(self, volume_uuid): """ Get the volume info of a particular volume.