From 5f520cbc04f7490cf18e19a69cb8227bde1557c1 Mon Sep 17 00:00:00 2001 From: Aishwarya-Hebbar Date: Mon, 2 Sep 2024 17:47:09 +0530 Subject: [PATCH] vsan stretch automation for TKG, WCP and VMService VMs --- tests/e2e/csi_static_provisioning_basic.go | 1 - tests/e2e/e2e_common.go | 68 +- tests/e2e/hci_mesh_rwx_disruptive.go | 8 +- ...i_mesh_rwx_singlevc_topology_disruptive.go | 2 +- tests/e2e/rwx_topology_utils.go | 16 +- tests/e2e/util.go | 12 +- tests/e2e/vm_service_vsan_stretch_cluster.go | 927 ++- tests/e2e/vmservice_utils.go | 100 +- tests/e2e/vsan_stretched_cluster.go | 6581 +++++++++-------- tests/e2e/vsan_stretched_cluster_utils.go | 84 +- 10 files changed, 4665 insertions(+), 3134 deletions(-) diff --git a/tests/e2e/csi_static_provisioning_basic.go b/tests/e2e/csi_static_provisioning_basic.go index aada0ccc37..690a767ea4 100644 --- a/tests/e2e/csi_static_provisioning_basic.go +++ b/tests/e2e/csi_static_provisioning_basic.go @@ -2430,5 +2430,4 @@ var _ = ginkgo.Describe("Basic Static Provisioning", func() { gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) - }) diff --git a/tests/e2e/e2e_common.go b/tests/e2e/e2e_common.go index acab02f44e..38d3491464 100644 --- a/tests/e2e/e2e_common.go +++ b/tests/e2e/e2e_common.go @@ -277,38 +277,42 @@ customPort -> label include the testcases running on vCenter custom port label include the testcases which are no longer in execution */ const ( - flaky = "flaky" - disruptive = "disruptive" - wcp = "wcp" - tkg = "tkg" - vanilla = "vanilla" - preferential = "preferential" - vsphereConfigSecret = "vsphereConfigSecret" - snapshot = "snapshot" - stable = "stable" - newTest = "newTest" - multiVc = "multiVc" - block = "block" - file = "file" - core = "core" - hci = "hci" - p0 = "p0" - p1 = "p1" - p2 = "p2" - vsanStretch = "vsanStretch" - longRunning = "longRunning" - deprecated = "deprecated" - vmc = "vmc" - tkgsHA = "tkgsHA" - thickThin = "thickThin" - customPort = "customPort" - windows = "windows" - semiAutomated = "semiAutomated" - level2 = "level2" - level5 = "level5" - negative = "negative" - listVolume = "listVolume" - multiSvc = "multiSvc" + flaky = "flaky" + disruptive = "disruptive" + wcp = "wcp" + tkg = "tkg" + vanilla = "vanilla" + preferential = "preferential" + vsphereConfigSecret = "vsphereConfigSecret" + snapshot = "snapshot" + stable = "stable" + newTest = "newTest" + multiVc = "multiVc" + block = "block" + file = "file" + core = "core" + hci = "hci" + p0 = "p0" + p1 = "p1" + p2 = "p2" + vsanStretch = "vsanStretch" + longRunning = "longRunning" + deprecated = "deprecated" + vmc = "vmc" + tkgsHA = "tkgsHA" + thickThin = "thickThin" + customPort = "customPort" + windows = "windows" + semiAutomated = "semiAutomated" + level2 = "level2" + level5 = "level5" + negative = "negative" + listVolume = "listVolume" + multiSvc = "multiSvc" + primaryCentric = "primaryCentric" + controlPlaneOnPrimary = "controlPlaneOnPrimary" + distributed = "distributed" + vmsvc = "vmsvc" ) // The following variables are required to know cluster type to run common e2e diff --git a/tests/e2e/hci_mesh_rwx_disruptive.go b/tests/e2e/hci_mesh_rwx_disruptive.go index 1cbe55129f..e7c26f8110 100644 --- a/tests/e2e/hci_mesh_rwx_disruptive.go +++ b/tests/e2e/hci_mesh_rwx_disruptive.go @@ -677,7 +677,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD all host in remote cluster4 and when psod is triggered, create new set of rwx pvc") for i := 0; i < len(hostListCluster4); i++ { - err = psodHost(hostListCluster4[i]) + err = 
psodHost(hostListCluster4[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { @@ -729,7 +729,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD again all host in remote cluster4 and perform scaleup " + "operation on deployment and statefulset") for i := 0; i < len(hostListCluster4); i++ { - err = psodHost(hostListCluster4[i]) + err = psodHost(hostListCluster4[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { @@ -1873,7 +1873,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD all host in local cluster2 and when psod is triggered, create new set of rwx pvc") for i := 0; i < len(hostListCluster2); i++ { - err = psodHost(hostListCluster2[i]) + err = psodHost(hostListCluster2[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { @@ -1929,7 +1929,7 @@ var _ = ginkgo.Describe("[rwx-hci-singlevc-disruptive] RWX-Topology-HciMesh-Sing ginkgo.By("PSOD all host in local cluster3 and perform scaleup " + "operation on deployment and statefulset") for i := 0; i < len(hostListCluster3); i++ { - err = psodHost(hostListCluster3[i]) + err = psodHost(hostListCluster3[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 0 { diff --git a/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go b/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go index 33ce89e20a..890261ce99 100644 --- a/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go +++ b/tests/e2e/no_hci_mesh_rwx_singlevc_topology_disruptive.go @@ -752,7 +752,7 @@ var _ = ginkgo.Describe("[rwx-nohci-singlevc-disruptive] RWX-Topology-NoHciMesh- ginkgo.By("PSOD all host") for i := 0; i < len(hostList); i++ { - err = psodHost(hostList[i]) + err = psodHost(hostList[i], "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) if i == 2 { diff --git a/tests/e2e/rwx_topology_utils.go b/tests/e2e/rwx_topology_utils.go index 878ee768a9..6f52b16f56 100644 --- a/tests/e2e/rwx_topology_utils.go +++ b/tests/e2e/rwx_topology_utils.go @@ -893,18 +893,24 @@ func verifyK8sNodeStatusAfterSiteRecovery(client clientset.Interface, ctx contex } /* This util will perform psod operation on a host */ -func psodHost(hostIP string) error { +func psodHost(hostIP string, psodTimeOut string) error { ginkgo.By("PSOD") - sshCmd := fmt.Sprintf("vsish -e set /config/Misc/intOpts/BlueScreenTimeout %s", psodTime) - op, err := runCommandOnESX("root", hostIP, sshCmd) + var timeout string + if psodTimeOut != "" { + timeout = psodTimeOut + } else { + timeout = psodTime + } + sshCmd := fmt.Sprintf("vsish -e set /config/Misc/intOpts/BlueScreenTimeout %s", timeout) + op, err := runCommandOnESX(rootUser, hostIP, sshCmd) framework.Logf(op) if err != nil { return fmt.Errorf("failed to set BlueScreenTimeout: %w", err) } ginkgo.By("Injecting PSOD") - psodCmd := "vsish -e set /reliability/crashMe/Panic 1" - op, err = runCommandOnESX("root", hostIP, psodCmd) + psodCmd := "vsish -e set /reliability/crashMe/Panic 1; exit" + op, err = runCommandOnESX(rootUser, hostIP, psodCmd) framework.Logf(op) if err != nil { return fmt.Errorf("failed to inject PSOD: %w", err) diff --git a/tests/e2e/util.go b/tests/e2e/util.go index c9a6928b01..c31fa4278f 100644 --- a/tests/e2e/util.go +++ b/tests/e2e/util.go @@ -3621,18 +3621,8 @@ func psodHostWithPv(ctx context.Context, vs *vSphere, pvName string) string { framework.Logf("hostIP %v", hostIP) gomega.Expect(hostIP).NotTo(gomega.BeEmpty()) - ginkgo.By("PSOD") - sshCmd := fmt.Sprintf("vsish -e set 
/config/Misc/intOpts/BlueScreenTimeout %s", psodTime) - op, err := runCommandOnESX("root", hostIP, sshCmd) - framework.Logf(op) + err := psodHost(hostIP, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Injecting PSOD ") - psodCmd := "vsish -e set /reliability/crashMe/Panic 1" - op, err = runCommandOnESX("root", hostIP, psodCmd) - framework.Logf(op) - gomega.Expect(err).ShouldNot(gomega.HaveOccurred()) - return hostIP } diff --git a/tests/e2e/vm_service_vsan_stretch_cluster.go b/tests/e2e/vm_service_vsan_stretch_cluster.go index 928a41978f..53546f52a0 100644 --- a/tests/e2e/vm_service_vsan_stretch_cluster.go +++ b/tests/e2e/vm_service_vsan_stretch_cluster.go @@ -21,13 +21,15 @@ import ( "fmt" "os" "strings" + "sync" "time" - "github.com/onsi/ginkgo/v2" + ginkgo "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" vmopv1 "github.com/vmware-tanzu/vm-operator/api/v1alpha1" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" clientset "k8s.io/client-go/kubernetes" @@ -120,7 +122,7 @@ var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests vmImageName := GetAndExpectStringEnvVar(envVmsvcVmImageName) framework.Logf("Waiting for virtual machine image list to be available in namespace '%s' for image '%s'", namespace, vmImageName) - vmi = waitNGetVmiForImageName(ctx, vmopC, namespace, vmImageName) + vmi = waitNGetVmiForImageName(ctx, vmopC, vmImageName) gomega.Expect(vmi).NotTo(gomega.BeEmpty()) }) @@ -144,7 +146,7 @@ var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests /* VMService - primary site down Steps: - 1. Create a few PVCs using the storageclass as mentioned in testbed structure. + 1. Create a few PVCs using the storageclass as mentioned in testbed structure. 2. Create a VMservice VM from each PVC created in step2. 3. Verify all PVC's metadata on CNS. 4. Once the VMs are up verify that the volume is accessible inside the VM. @@ -158,142 +160,871 @@ var _ bool = ginkgo.Describe("[vsan-stretch-vmsvc] vm service with csi vol tests 11. Bring primary site up and wait for testbed to be back to normal. 12. Delete all objects created in this test. 
*/ - ginkgo.It("VMService - primary site down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - var pvcCount int = 10 - var err error + ginkgo.It("VMService - primary site down", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error - ginkgo.By("Creating StorageClass") + ginkgo.By("Creating StorageClass") - sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Create multiple PVCs") - pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) - ginkgo.By("Waiting for all claims to be in bound state") - pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - for i, pvc := range pvclaimsList { - ginkgo.By("Delete PVCs") - err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Waiting for CNS volumes to be deleted") - volHandle := pvs[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the primary site") + siteFailover(ctx, true) + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && 
fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - ginkgo.By("Creating VM bootstrap data") - secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) - defer func() { - ginkgo.By("Deleting VM bootstrap data") - err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + time.Sleep(5 * time.Minute) + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - ginkgo.By("Creating VM") - vms := createVMServiceVmWithMultiplePvcs( - ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) - defer func() { for _, vm := range vms { - ginkgo.By("Deleting VM") - err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ - Name: vm.Name, - Namespace: namespace, - }}) + _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - ginkgo.By("Creating loadbalancing service for ssh with the VM") - for _, vm := range vms { - vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + ginkgo.By("Perform volume and application lifecycle actions") + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + Secondary site down + Steps: + 1. Create a few PVCs using the storageclass as mentioned in testbed structure. + 2. Create a VMservice VM from each PVC created in step2. + 3. Verify all PVC's metadata on CNS. + 4. Once the VMs are up verify that the volume is accessible inside the VM. + 5. Write data on volumes created. + 6. Bring down the secondary site by powering off the hosts in secondary site. + 7. Verify that the supervisor cluster should be in running and ready state after site failover. + 8. Verify that all the k8s constructs created in the test are running fine. + 9. Perform volume lifecycle actions which should work fine. + 10.Verify the data written in step 5. + 11.Bring secondary site up and wait for testbed to be back to normal. + 12.Delete all objects created in this test. 
+ */ + ginkgo.It("VMService - secondary site down", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + + ginkgo.By("Get StorageClass for volume creation") + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { - ginkgo.By("Deleting loadbalancing service for ssh with the VM") - err = vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ - Name: vmlbsvc.Name, - Namespace: namespace, - }}) + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) }() - } - ginkgo.By("Wait for VM to come up and get an IP") - for j, vm := range vms { - vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Wait and verify PVCs are attached to the VM") - gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, - []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the secondary site") + siteFailover(ctx, false) + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are 
running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify PVCs are accessible to the VM") - ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") - vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i, vol := range vm.Status.Volumes { - volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) - verifyDataIntegrityOnVmDisk(vmIp, volFolder) + + for _, vm := range vms { + _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - } - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify volume lifecycle actions when there is a fault induced") + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) - ginkgo.By("Bring down the primary site") - siteFailover(ctx, true) + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService VM creation while primary site goes down¯ + Steps: + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure and verify that it goes to bound state. + 2. Create VMService VM with each PVC created in step1. + 3. While VMService VM creation is going on, bring down the primary site by powering off the hosts in primary site in parallel. + 4. Verify that the supervisor cluster should be in running and ready state after site failover. + 5. Verify that all the PVCs created in step 2 are running fine. + 6. Perform volume lifecycle actions which should work fine. + 7. Bring primary site up and wait for testbed to be back to normal. + 8. Delete all objects created in the test. 
+ */ + ginkgo.It("VMService VM creation while primary site goes down", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 9 + var vmCount = 9 + var err error + var vms []*vmopv1.VirtualMachine + + ginkgo.By("Creating StorageClass") + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ch := make(chan *vmopv1.VirtualMachine) + var wg sync.WaitGroup + var lock sync.Mutex + ginkgo.By("Creating VM in parallel to site failure") + wg.Add(2) + go createVMServiceVmInParallel(ctx, vmopC, namespace, vmClass, pvclaimsList, + vmi, storageClassName, secretName, vmCount, ch, &wg, &lock) + go func() { + for v := range ch { + vms = append(vms, v) + } + }() + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + close(ch) + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify 
PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + ginkgo.By("Verify volume lifecycle actions when there is a fault induced") + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + VMService VM deletion while secondary site goes down + Steps: + + 1. Create 10 PVCS using the storageclass as mentioned in testbed structure + and verify they go into bound state. + 2. Create VMService VM with each PVC created in step1. + 3. Verify all PVC's metadata on CNS. + 4. Once the VMs are up verify that the volume is accessible inside the VM. + 5. Delete all the VMs created in step2. + 6. While VMService VM deletion is going on, + bring down the secondary site by powering off the hosts in secondary site in parallel. + 7. Verify that the supervisor cluster should be in running and ready state after site failover. + 8. Verify all the VMservice vms created in step2 are deleted successfully. + 9. Perform volume lifecycle actions which should work fine. + 10.Bring secondary site up and wait for testbed to be back to normal. + 11.Delete all objects created in this test. + */ + ginkgo.It("VMService VM deletion while secondary site goes down", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + + ginkgo.By("Creating StorageClass") + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + _, err := getVmsvcVM(ctx, vmopC, namespace, vm.Name) + if !apierrors.IsNotFound(err) 
{ + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } + }() + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + var wg sync.WaitGroup + ginkgo.By("Deleting VM in parallel to secondary site failure") + wg.Add(2) + go deleteVMServiceVmInParallel(ctx, vmopC, vms, namespace, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + time.Sleep(5 * time.Minute) + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify all the VMservice vms created before " + + "secondary site failure are deleted successfully") + for _, vm := range vms { + _, err := getVmsvcVM(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).To(gomega.HaveOccurred()) + } + + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) + + /* + PSOD hosts on secondary site + Steps: + 1. Create a few PVCs using the storageclass as mentioned in testbed structure. + 2. Verify all PVC's metadata on CNS. + 3. Create a VMservice VM from each PVC created in step2. + 4. Write data on volumes created. + 5. While VMService VM creation is going on,PSOD all hosts + in secondary site in parallel. + 6. Verify that the supervisor cluster should be in running + and ready state after site failover. + 7. Verify that all the k8s constructs created in step 2 are running fine. + 8. Perform volume lifecycle actions which should work fine. + 9. Verify the data written in step 3. + 10.Wait for psod timeout to be over and wait for testbed to be back to normal. + 11.Delete all objects created in this test. 
+ */ + ginkgo.It("VMService - psod hosts on secondary site", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var vms []*vmopv1.VirtualMachine + var svcCsipods, csipods *v1.PodList + + ginkgo.By("Creating StorageClass") + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, 10, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ch := make(chan *vmopv1.VirtualMachine) + var wg sync.WaitGroup + var lock sync.Mutex + ginkgo.By("Creating VM in parallel to site failure") + wg.Add(2) + go createVMServiceVmInParallel(ctx, vmopC, namespace, vmClass, + pvclaimsList, vmi, storageClassName, secretName, 10, ch, &wg, &lock) + go func() { + for v := range ch { + vms = append(vms, v) + } + }() + go psodHostsInParallel(true, "600", &wg) + wg.Wait() + close(ch) + + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) + + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Check if csi pods are running fine after site recovery") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + ginkgo.By("Verify volume lifecycle actions when there is a fault induced") + 
performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, + vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") if len(fds.hostsDown) > 0 && fds.hostsDown != nil { siteRestore(true) fds.hostsDown = nil } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - if vanillaCluster { wait4AllK8sNodesToBeUp(ctx, client, nodeList) - } - if guestCluster { err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - time.Sleep(5 * time.Minute) - // Check if csi pods are running fine after site failure - ginkgo.By("Check if csi pods are running fine after site failure") - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) - ginkgo.By("Waiting for all claims to be in bound state") - pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + /* + VMService - witness failure + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Create a statefulset, deployment with volumes from the stretched datastore + 3. Bring down the primary site + 4. Verify that the VMs hosted by esx servers are brought up on the other site + 5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume + and application lifecycle actions work fine + 6. Bring primary site up and wait for testbed to be back to normal + 7. Delete all objects created in step 2 and 5 + */ + ginkgo.It("VMService - witness failure", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error - for _, vm := range vms { - _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) + ginkgo.By("Creating StorageClass") + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Perform volume and application lifecycle actions") - performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, vmClass, namespace, vmi, sc, secretName) + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring up the primary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down witness host") + toggleWitnessPowerState(ctx, true) + defer func() { + ginkgo.By("Bring up the witness host before terminating the test") + if fds.witnessDown != "" { + toggleWitnessPowerState(ctx, false) + } + }() - }) + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := 
createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, + secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + ginkgo.By("Check storage compliance") + comp := checkVmStorageCompliance(client, storagePolicyName) + if comp { + framework.Failf("Expected VM and storage compliance to be false but found true") + } + + ginkgo.By("Bring up witness host") + if fds.witnessDown != "" { + toggleWitnessPowerState(ctx, false) + } + + time.Sleep(5 * time.Minute) + ginkgo.By("Check storage compliance") + comp = checkVmStorageCompliance(client, storagePolicyName) + if !comp { + framework.Failf("Expected VM and storage compliance to be true but found false") + } + + }) + + /* + Primary site network isolation + Steps: + 1. Create a few PVCs using the storageclass as mentioned in testbed structure. + 2. Verify all PVC's metadata on CNS. + 3. Create a VMservice VM from each PVC created in step2. + 4. Write data on volumes created. + 5. While VMService VM creation is going on, isolate the primary site from the witness and the secondary site + such that both witness and secondary site can't talk to primary site. VC will have access to both secondary site and primary site. + 6. Verify that the supervisor cluster should be in running and ready state after site failover. + 7. Verify all the k8s constructs created in step 2 are running and volume and application lifecycle actions work fine + 8. Once the VMs are up verify that the volume is accessible inside the VM. + 9. Perform volume lifecycle actions which should work fine. + 10. Verify the CNS metadata entries. + 11. Re-establish primary site network and wait for testbed to be back to normal + 12. Delete all objects created in this test. 
+ */ + ginkgo.It("VMService - Primary site network isolation", + ginkgo.Label(p0, vmsvc, vsanStretch, block, wcp), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var pvcCount int = 10 + var err error + + ginkgo.By("Creating StorageClass") + + sc, err := client.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Create multiple PVCs") + pvclaimsList := createMultiplePVCsInParallel(ctx, client, namespace, sc, pvcCount, nil) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + for i, pvc := range pvclaimsList { + ginkgo.By("Delete PVCs") + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Waiting for CNS volumes to be deleted") + volHandle := pvs[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creating VM bootstrap data") + secretName := createBootstrapSecretForVmsvcVms(ctx, client, namespace) + defer func() { + ginkgo.By("Deleting VM bootstrap data") + err := client.CoreV1().Secrets(namespace).Delete(ctx, secretName, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + ginkgo.By("Creating VM") + vms := createVMServiceVmWithMultiplePvcs( + ctx, vmopC, namespace, vmClass, pvclaimsList, vmi, storageClassName, secretName) + defer func() { + for _, vm := range vms { + ginkgo.By("Deleting VM") + err = vmopC.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + ginkgo.By("Creates a loadbalancing service for ssh with each VM" + + "and waits for VM IP to come up to come up and verify PVCs are accessible in the VM") + createVMServiceandWaitForVMtoGetIP(ctx, vmopC, cnsopC, namespace, vms, pvclaimsList, true, true) + + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Cause a network failure on primary site + ginkgo.By("Isolate secondary site from witness and primary site") + siteNetworkFailure(false, false) + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + siteNetworkFailure(false, true) + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(ctx, client, nodeList) + } + if guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for all claims to be in bound state") + pvs, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsList, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, vm := range vms { + _, err := wait4Vm2ReachPowerStateInSpec(ctx, vmopC, vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Verify volume lifecycle actions when there is a fault 
induced") + performVolumeLifecycleActionForVmServiceVM(ctx, client, vmopC, cnsopC, + vmClass, namespace, vmi, sc, secretName) + + ginkgo.By("Bring up the primary site") + siteNetworkFailure(false, true) + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) }) diff --git a/tests/e2e/vmservice_utils.go b/tests/e2e/vmservice_utils.go index 062518bafe..b3061d3ddb 100644 --- a/tests/e2e/vmservice_utils.go +++ b/tests/e2e/vmservice_utils.go @@ -28,6 +28,7 @@ import ( "reflect" "strconv" "strings" + "sync" "time" "github.com/onsi/ginkgo/v2" @@ -251,7 +252,7 @@ func invokeVCRestAPIDeleteRequest(vcRestSessionId string, url string) ([]byte, i } // waitNGetVmiForImageName waits and fetches VM image CR for given image name in the specified namespace -func waitNGetVmiForImageName(ctx context.Context, c ctlrclient.Client, namespace string, imageName string) string { +func waitNGetVmiForImageName(ctx context.Context, c ctlrclient.Client, imageName string) string { vmi := "" err := wait.PollUntilContextTimeout(ctx, poll*5, pollTimeout, true, func(ctx context.Context) (bool, error) { @@ -335,7 +336,7 @@ func waitNgetVmsvcVM(ctx context.Context, c ctlrclient.Client, namespace string, // waitNgetVmsvcVmIp wait and fetch the primary IP of the vm in give ns func waitNgetVmsvcVmIp(ctx context.Context, c ctlrclient.Client, namespace string, name string) (string, error) { ip := "" - err := wait.PollUntilContextTimeout(ctx, poll*10, pollTimeout*2, true, + err := wait.PollUntilContextTimeout(ctx, poll*10, pollTimeout*4, true, func(ctx context.Context) (bool, error) { vm, err := getVmsvcVM(ctx, c, namespace, name) if err != nil { @@ -906,6 +907,59 @@ func createVMServiceVmWithMultiplePvcs(ctx context.Context, c ctlrclient.Client, return vms } +// createVMServiceVmInParallel creates VMService VM concurrently +// for a given namespace with 1:1 mapping between PVC and the VMServiceVM +func createVMServiceVmInParallel(ctx context.Context, c ctlrclient.Client, namespace string, vmClass string, + pvcs []*v1.PersistentVolumeClaim, vmi string, storageClassName string, secretName string, + vmCount int, ch chan *vmopv1.VirtualMachine, wg *sync.WaitGroup, lock *sync.Mutex) { + defer wg.Done() + for i := 0; i < vmCount; i++ { + r := rand.New(rand.NewSource(time.Now().UnixNano())) + vols := []vmopv1.VirtualMachineVolume{} + vmName := fmt.Sprintf("csi-test-vm-%d", r.Intn(10000)) + + vols = append(vols, vmopv1.VirtualMachineVolume{ + Name: pvcs[i].Name, + PersistentVolumeClaim: &vmopv1.PersistentVolumeClaimVolumeSource{ + PersistentVolumeClaimVolumeSource: v1.PersistentVolumeClaimVolumeSource{ClaimName: pvcs[i].Name}, + }, + }) + + vm := vmopv1.VirtualMachine{ + ObjectMeta: metav1.ObjectMeta{Name: vmName, Namespace: namespace}, + Spec: vmopv1.VirtualMachineSpec{ + PowerState: vmopv1.VirtualMachinePoweredOn, + ImageName: vmi, + ClassName: vmClass, + StorageClass: storageClassName, + Volumes: vols, + VmMetadata: &vmopv1.VirtualMachineMetadata{Transport: cloudInitLabel, SecretName: secretName}, + }, + } + err := c.Create(ctx, &vm) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + lock.Lock() + ch <- &vm + lock.Unlock() + framework.Logf("Created VMServiceVM: %s", vmName) + } +} + +// deleteVMServiceVmInParallel deletes the VMService VMs concurrently from a given namespace +func deleteVMServiceVmInParallel(ctx context.Context, c ctlrclient.Client, vms []*vmopv1.VirtualMachine, namespace string, + wg 
*sync.WaitGroup) { + + defer wg.Done() + for _, vm := range vms { + err := c.Delete(ctx, &vmopv1.VirtualMachine{ObjectMeta: metav1.ObjectMeta{ + Name: vm.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } +} + // performVolumeLifecycleActionForVmServiceVM creates pvc and attaches a VMService VM to it // and waits for the workloads to be in healthy state and then deletes them func performVolumeLifecycleActionForVmServiceVM(ctx context.Context, client clientset.Interface, @@ -992,3 +1046,45 @@ func updateVmWithNewPvc(ctx context.Context, vmopC ctlrclient.Client, vmName str } return nil } + +// createVMServiceandWaitForVMtoGetIP creates a loadbalancing service for ssh with each VM +// and waits for VM IP to come up to come up and verify PVCs are accessible in the VM +func createVMServiceandWaitForVMtoGetIP(ctx context.Context, vmopC ctlrclient.Client, cnsopC ctlrclient.Client, namespace string, + vms []*vmopv1.VirtualMachine, pvclaimsList []*v1.PersistentVolumeClaim, doCreateVmSvc bool, waitForVmIp bool) { + + if doCreateVmSvc { + ginkgo.By("Creating loadbalancing service for ssh with the VM") + for _, vm := range vms { + vmlbsvc := createService4Vm(ctx, vmopC, namespace, vm.Name) + defer func() { + ginkgo.By("Deleting loadbalancing service for ssh with the VM") + err := vmopC.Delete(ctx, &vmopv1.VirtualMachineService{ObjectMeta: metav1.ObjectMeta{ + Name: vmlbsvc.Name, + Namespace: namespace, + }}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } + } + + if waitForVmIp { + ginkgo.By("Wait for VM to come up and get an IP") + for j, vm := range vms { + vmIp, err := waitNgetVmsvcVmIp(ctx, vmopC, namespace, vm.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Wait and verify PVCs are attached to the VM") + gomega.Expect(waitNverifyPvcsAreAttachedToVmsvcVm(ctx, vmopC, cnsopC, vm, + []*v1.PersistentVolumeClaim{pvclaimsList[j]})).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verify PVCs are accessible to the VM") + ginkgo.By("Write some IO to the CSI volumes and read it back from them and verify the data integrity") + vm, err = getVmsvcVM(ctx, vmopC, vm.Namespace, vm.Name) // refresh vm info + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, vol := range vm.Status.Volumes { + volFolder := formatNVerifyPvcIsAccessible(vol.DiskUuid, i+1, vmIp) + verifyDataIntegrityOnVmDisk(vmIp, volFolder) + } + } + } +} diff --git a/tests/e2e/vsan_stretched_cluster.go b/tests/e2e/vsan_stretched_cluster.go index 7db6da129d..deca0c0249 100644 --- a/tests/e2e/vsan_stretched_cluster.go +++ b/tests/e2e/vsan_stretched_cluster.go @@ -39,6 +39,8 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" + restclient "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" "k8s.io/kubernetes/test/e2e/framework" fdep "k8s.io/kubernetes/test/e2e/framework/deployment" fnodes "k8s.io/kubernetes/test/e2e/framework/node" @@ -46,6 +48,7 @@ import ( fpv "k8s.io/kubernetes/test/e2e/framework/pv" fss "k8s.io/kubernetes/test/e2e/framework/statefulset" admissionapi "k8s.io/pod-security-admission/api" + clientgrp "sigs.k8s.io/controller-runtime/pkg/client" cnsoperatorv1alpha1 "sigs.k8s.io/vsphere-csi-driver/v3/pkg/apis/cnsoperator" k8s "sigs.k8s.io/vsphere-csi-driver/v3/pkg/kubernetes" ) @@ -78,6 +81,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f nimbusGeneratedK8sVmPwd string sc *storagev1.StorageClass accessMode 
v1.PersistentVolumeAccessMode + err error ) ginkgo.BeforeEach(func() { @@ -100,7 +104,6 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f // TODO: verify csi pods are up if guestCluster { - if k8senv := GetAndExpectStringEnvVar("SUPERVISOR_CLUSTER_KUBE_CONFIG"); k8senv != "" { svcClient, err = createKubernetesClientFromConfig(k8senv) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -182,13 +185,18 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f accessMode = v1.ReadWriteOnce } + if rwxAccessMode { + accessMode = v1.ReadWriteMany + } else { + accessMode = v1.ReadWriteOnce + } }) ginkgo.AfterEach(func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() if !guestCluster { - wait4AllK8sNodesToBeUp(ctx, client, nodeList) + wait4AllK8sNodesToBeUp(nodeList) } err := waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -244,7 +252,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f /* Primary site down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create a statefulset, deployment with volumes from the stretched datastore 3. Bring down the primary site 4. Verify that the VMs hosted by esx servers are brought up on the other site @@ -253,7 +261,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 6. Bring primary site up and wait for testbed to be back to normal 7. Delete all objects created in step 2 and 5 */ - ginkgo.It("[primary-centric][csi-vsan-stretch-wcp][csi-vsan-stretch-tkg] Primary site down", func() { + ginkgo.It("Primary site down", ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric), func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() var stsReplicas, depReplicaCount int32 @@ -335,7 +343,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f ginkgo.By("Wait for k8s cluster to be healthy") if vanillaCluster { - wait4AllK8sNodesToBeUp(ctx, client, nodeList) + wait4AllK8sNodesToBeUp(nodeList) } if guestCluster || vanillaCluster { err = waitForAllNodes2BeReady(ctx, client) @@ -386,7 +394,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f if guestCluster { ginkgo.By("Check for nodes to be in Ready state in supervisor") - wait4AllK8sNodesToBeUp(ctx, svcClient, svcNodeList) + wait4AllK8sNodesToBeUp(nodeList) err = waitForAllNodes2BeReady(ctx, svcClient) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } @@ -413,6 +421,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f } } + ginkgo.By("Delete all the statefulsets and deployments in namespace") scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) }) @@ -420,7 +429,7 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f /* Statefulset scale up/down while primary site goes down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create two statefulset with replica count 1(sts1) and 5(sts2) respectively using a thick provision policy and wait for all replicas to be running 3. Change replica count of sts1 and sts2 to 3 @@ -432,227 +441,226 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 8. Delete statefulsets and its pvcs created in step 2 9. 
Bring primary site up and wait for testbed to be back to normal */ - ginkgo.It("[primary-centric][control-plane-on-primary]"+ - "[csi-vsan-stretch-wcp][csi-vsan-stretch-tkg] Statefulset scale up/down while primary"+ - " site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 - var err error - var svcCsipods *v1.PodList - - if vanillaCluster { - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters[scParamStoragePolicyName] = storagePolicyName - scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) - sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - } else { - ginkgo.By("CNS_TEST: Running for GC setup") - sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - } - - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() - - ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + - "the replicas to be running") - - if rwxAccessMode { - dep1ReplicaCount = 3 - dep2ReplicaCount = 5 - } else { - dep1ReplicaCount = 1 - dep2ReplicaCount = 1 - } - sts1Replicas = 1 - sts2Replicas = 5 - statefulset1, deployment1, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, sts1Replicas, "web", dep1ReplicaCount, accessMode) - statefulset2, deployment2, _ := createStsDeployment(ctx, client, namespace, sc, true, - true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessMode) - ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - if guestCluster { - svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - defer func() { - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + ginkgo.It("Statefulset scale up/down while primary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric, controlPlaneOnPrimary), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + var err error + var svcCsipods *v1.PodList + + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = 
client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) - } - }() - - if rwxAccessMode { - dep1ReplicaCount += 3 - dep2ReplicaCount += 3 - err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } else { - sts1Replicas += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) - fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) + } - sts2Replicas -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) - fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) - } + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - ginkgo.By("Bring down the primary site") - siteFailover(ctx, true) + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil + if rwxAccessMode { + dep1ReplicaCount = 3 + dep2ReplicaCount = 5 + } else { + dep1ReplicaCount = 1 + dep2ReplicaCount = 1 } - }() - - ginkgo.By("Wait for k8s cluster to be healthy") - if vanillaCluster { - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - } - if vanillaCluster && guestCluster { - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + sts1Replicas = 1 + sts2Replicas = 5 + statefulset1, deployment1, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, sts1Replicas, "web", dep1ReplicaCount, accessMode) + statefulset2, deployment2, _ := createStsDeployment(ctx, client, namespace, sc, true, + true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - time.Sleep(5 * time.Minute) - if guestCluster { - ginkgo.By("Check if csi pods are running fine after site failure in supervisor") - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - // Check if csi pods are running fine after site failure - ginkgo.By("Check if csi 
pods are running fine after site failure") - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc, "") + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() - // Statefulset and deployments in PodVM might got to Terminating state as - // the nodes attached to these pods might become inaccessible during site failure. - // Hence validating these steps once site is restored back. - if !supervisorCluster { if rwxAccessMode { dep1ReplicaCount += 3 - err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, - nil, "") dep2ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, - nil, "") + } else { - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - sts1Replicas, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, sts2Replicas, false, true) - - // Scaling up statefulset sts1 sts1Replicas += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - sts1Replicas, true, false) + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) + fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) - // Scaling down statefulset sts2 sts2Replicas -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, sts2Replicas, true, false) + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) + fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) } - } - ginkgo.By("Bring up the primary site") - if 
len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } + ginkgo.By("Bring down the primary site") + siteFailover(ctx, true) - if guestCluster { - ginkgo.By("Check for nodes to be in Ready state in supervisor") - wait4AllK8sNodesToBeUp(ctx, svcClient, svcNodeList) - err = waitForAllNodes2BeReady(ctx, svcClient) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster && guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - if !supervisorCluster { - if rwxAccessMode { - dep1ReplicaCount += 3 - err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + time.Sleep(5 * time.Minute) + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, - nil, "") - dep2ReplicaCount += 3 - err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + } + + // Check if csi pods are running fine after site failure + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, "") + + // Statefulset and deployments in PodVM might got to Terminating state as + // the nodes attached to these pods might become inaccessible during site failure. + // Hence validating these steps once site is restored back. 
+ if !supervisorCluster { + if rwxAccessMode { + dep1ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + dep2ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } else { + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, false, true) + + // Scaling up statefulset sts1 + sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) + + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + } + } + + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, svcClient) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, - nil, "") - } else { - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - sts1Replicas, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, sts2Replicas, false, true) - - // Scaling up statefulset sts1 - sts1Replicas += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - sts1Replicas, true, false) + } - // Scaling down statefulset sts2 - sts2Replicas -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, sts2Replicas, true, false) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if !supervisorCluster { + if rwxAccessMode { + dep1ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } else { + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for 
volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, false, true) + + // Scaling up statefulset sts1 + sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) + + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) + } } - } - }) + }) /* Pod deletion while primary site goes down @@ -670,144 +678,145 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 10. Delete the PVCs created in step 2 */ - ginkgo.It("[primary-centric][control-plane-on-primary] Pod deletion while primary site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - if os.Getenv(envFullSyncWaitTime) != "" { - fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) - framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } else { - fullSyncWaitTime = defaultFullSyncWaitTime - } - var pods []*v1.Pod - var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("[primary-centric][control-plane-on-primary] Pod deletion while primary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, primaryCentric, controlPlaneOnPrimary), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" + if os.Getenv(envFullSyncWaitTime) != "" { + fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) + framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + fullSyncWaitTime = defaultFullSyncWaitTime + } + var pods []*v1.Pod + var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc") - pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + defer func() { + err := 
client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc") + pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - volumeHandle := pv.Spec.CSI.VolumeHandle - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. - _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + volumeHandle := pv.Spec.CSI.VolumeHandle + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + errMsg := "The object or item referred to could not be found" + if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
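+					// The error returned by waitForCNSVolumeToBeDeleted is deliberately discarded
+					// below; the leftover (possibly orphan) volume is only logged so that cleanup
+					// of the remaining PVs in this loop can continue.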
+ _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) + } else { + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - } - }() + } + }() - ginkgo.By("Create pods") - for i := 0; i < volumeOpsScale; i++ { - pod, err := createPod(ctx, client, namespace, - nil, []*v1.PersistentVolumeClaim{pvclaims[i]}, - false, execCommand) - framework.Logf("Created pod %s", pod.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pods = append(pods, pod) - } - defer func() { - for _, pod := range pods { - err = fpod.DeletePodWithWait(ctx, client, pod) + ginkgo.By("Create pods") + for i := 0; i < volumeOpsScale; i++ { + pod, err := createPod(ctx, client, namespace, + nil, []*v1.PersistentVolumeClaim{pvclaims[i]}, + false, execCommand) + framework.Logf("Created pod %s", pod.Name) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pods = append(pods, pod) } - }() + defer func() { + for _, pod := range pods { + err = fpod.DeletePodWithWait(ctx, client, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring down the primary site while deleting pods") - var wg sync.WaitGroup - wg.Add(2) - go deletePodsInParallel(ctx, client, namespace, pods, &wg) - go siteFailureInParallel(ctx, true, &wg) - wg.Wait() + ginkgo.By("Bring down the primary site while deleting pods") + var wg sync.WaitGroup + wg.Add(2) + go deletePodsInParallel(ctx, client, namespace, pods, &wg) + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } - }() + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("Sleeping full-sync interval for all the pod Metadata " + - "to be deleted") - time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) + framework.Logf("Sleeping full-sync interval for all the pod Metadata " + + "to be deleted") + time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) - ginkgo.By("Verify volume is detached from the node") - for i := 
0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - isDiskDetached, err := e2eVSphere.waitForVolumeDetachedFromNode(client, volHandle, pods[i].Spec.NodeName) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(isDiskDetached).To(gomega.BeTrue(), - fmt.Sprintf("Volume %q is not detached from the node %q", volHandle, pods[i].Spec.NodeName)) + ginkgo.By("Verify volume is detached from the node") + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + isDiskDetached, err := e2eVSphere.waitForVolumeDetachedFromNode(client, volHandle, pods[i].Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(isDiskDetached).To(gomega.BeTrue(), + fmt.Sprintf("Volume %q is not detached from the node %q", volHandle, pods[i].Spec.NodeName)) - } - ginkgo.By("Bring up the primary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } + } + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + }) /* PVC creation while primary site goes down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create 30 PVCs using a thick provision policy so that it takes some time for PVC creation to go through 3. Bring down primary site 4. Verify that the VMs on the primary site are started up on the other esx servers in the secondary site @@ -815,117 +824,149 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 6. Bring primary site up and wait for testbed to be back to normal 7. 
Delete PVCs created in step 2 */ - ginkgo.It("[primary-centric] PVC creation while primary site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - var pvclaims []*v1.PersistentVolumeClaim - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.It("PVC creation while primary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var svcCsipods, csipods *v1.PodList + if vanillaCluster { + scParameters = map[string]string{} + storageClassName = "nginx-sc-thick" + scParameters[scParamStoragePolicyName] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Get csi pods list before bringing down the site") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the primary site while creating pvcs") + var wg sync.WaitGroup + ch := make(chan *v1.PersistentVolumeClaim) + lock := &sync.Mutex{} + wg.Add(2) + go createPvcInParallel(ctx, client, namespace, diskSize, sc, ch, lock, &wg, volumeOpsScale) + go func() { + for v := range ch { + pvclaims = append(pvclaims, v) + } + }() + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + close(ch) + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + defer func() { + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volumeHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volumeHandle).NotTo(gomega.BeEmpty()) + } + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() - ginkgo.By("Bring down the primary site while creating pvcs") - var wg sync.WaitGroup - ch := make(chan *v1.PersistentVolumeClaim) - lock := &sync.Mutex{} - wg.Add(2) - go createPvcInParallel(ctx, client, namespace, diskSize, sc, ch, lock, &wg, volumeOpsScale) - go func() { - for v := range ch { - pvclaims = append(pvclaims, v) + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster && guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - go siteFailureInParallel(ctx, true, &wg) - wg.Wait() - close(ch) - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil + time.Sleep(5 * time.Minute) + if guestCluster { + ginkgo.By("Check for csi pods to be in Ready state in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - defer func() { - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(persistentvolumes[i].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := persistentvolumes[i].Spec.CSI.VolumeHandle + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaims[i], + persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + err = waitAndVerifyCnsVolumeMetadata(ctx, volHandle, pvclaims[i], persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } + + ginkgo.By("Delete all PVCs created in this test") + for _, pvclaim := range pvclaims { + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) volumeHandle := pv.Spec.CSI.VolumeHandle err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - - 
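// A minimal sketch of the fan-out used above: run a batch of create operations
// concurrently with a fault injection, collecting results over a channel.
// createOne and injectFault are hypothetical stand-ins for createPvcInParallel
// and siteFailureInParallel; only context and sync from the standard library
// are assumed here.
func runCreatesDuringFault(ctx context.Context, n int,
	createOne func(context.Context, int) string,
	injectFault func(context.Context)) []string {

	var wg sync.WaitGroup
	ch := make(chan string)

	// Producer: create n objects while the fault is in progress.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < n; i++ {
			ch <- createOne(ctx, i)
		}
	}()

	// Fault injector runs concurrently with the creations.
	wg.Add(1)
	go func() {
		defer wg.Done()
		injectFault(ctx)
	}()

	// Close the channel only after both goroutines finish, so the collector
	// below terminates cleanly instead of blocking on an open channel.
	go func() {
		wg.Wait()
		close(ch)
	}()

	// Collector: drain results until the channel is closed.
	var created []string
	for v := range ch {
		created = append(created, v)
	}
	return created
}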
ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // TODO: List orphan volumes - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } - for _, pvclaim := range pvclaims { - err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - // TODO: List orphan volumes - - ginkgo.By("Bring up the primary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + }) /* Primary site network isolation Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create a statefulset, deployment with volumes from the stretched datastore 3. Isolate primary site from witness and secondary site 4. Verify that the VMs hosted by esx servers are brought up on the other site @@ -934,139 +975,149 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 6. Re-establish primary site network and wait for testbed to be back to normal 7. 
Delete all objects created in step 2 */ - ginkgo.It("[primary-centric][control-plane-on-primary][distributed] Primary site network isolation", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("Primary site network isolation", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric, controlPlaneOnPrimary, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() - ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") - statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 3, "", 1, "") - ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - replicas := *(statefulset.Spec.Replicas) + var stsReplicas int32 + var statefulset *appsv1.StatefulSet + var svcCsipods, csipods *v1.PodList - defer func() { - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + if rwxAccessMode { + stsReplicas = 3 + } else { + stsReplicas = 4 + } + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - volumeHandle := pv.Spec.CSI.VolumeHandle - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
- _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() + + ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") + statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, 3, "", 1, "") + ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) + defer func() { + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + volumeHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volumeHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volumeHandle).NotTo(gomega.BeEmpty()) + } + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + errMsg := "The object or item referred to could not be found" + if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
+ _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) + } else { + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } + }() + + ginkgo.By("Get csi pods list before bringing down the site") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Cause a network failure on primary site - ginkgo.By("Isolate primary site from witness and secondary site") - siteNetworkFailure(true, false) + // Cause a network failure on primary site + ginkgo.By("Isolate primary site from witness and secondary site") + siteNetworkFailure(true, false) + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + siteNetworkFailure(true, true) + }() - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - siteNetworkFailure(true, true) - }() + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Check for nodes to be in Ready state") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if guestCluster || vanillaCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - // Check if csi pods are running fine after network failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Check if csi pods are running fine after site failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Checking if volumes and pods post network failure are healthy") - pods, err := fdep.GetPodsForDeployment(ctx, client, deployment) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pod := pods.Items[0] - err = fpod.WaitForPodNameRunningInNamespace(ctx, client, pod.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Checking if volumes and pods post network failure are healthy") + pods, err := fdep.GetPodsForDeployment(ctx, client, deployment) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pod := pods.Items[0] + err = fpod.WaitForPodNameRunningInNamespace(ctx, client, pod.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - fss.WaitForStatusReadyReplicas(ctx, client, statefulset, replicas) - gomega.Expect(fss.CheckMount(ctx, client, statefulset, mountPath)).NotTo(gomega.HaveOccurred()) + fss.WaitForStatusReadyReplicas(ctx, client, statefulset, stsReplicas) + gomega.Expect(fss.CheckMount(ctx, client, statefulset, 
mountPath)).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc, "") - // Scale down replicas of statefulset and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) - // Scale up replicas of statefulset and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, "") + // Scale down replicas of statefulset and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + ssPodsBeforeScaleDown, stsReplicas, true, true) - ginkgo.By("Bring up the primary site") - siteNetworkFailure(true, true) + ginkgo.By("Performing scaleup operation on statefulset when site is down") + stsReplicas = 6 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + stsReplicas, true, true) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring up the primary site") + siteNetworkFailure(true, true) - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - volumeHandle := pv.Spec.CSI.VolumeHandle - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. - _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - }) + ginkgo.By("Scale down statefulset and deployment after site recovery") + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + }) /* PVC deletion while primary site goes down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create 30 PVCs and wait for each pvc to bind to its PV 3. Delete the PVCs created in step2 4. Bring down primary site @@ -1074,98 +1125,129 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 6. Verify PVs and CNS volumes associated with PVCs created in step 2 are also deleted successfully 7. 
Bring primary site up and wait for testbed to be back to normal */ - ginkgo.It("[primary-centric] PVC deletion while primary site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("[primary-centric] PVC deletion while primary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var svcCsipods *v1.PodList + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc %v", i) - pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc %v", i) + pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(persistentvolumes[i].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := persistentvolumes[i].Spec.CSI.VolumeHandle + err = 
waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaims[i], + persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + err = waitAndVerifyCnsVolumeMetadata(ctx, volHandle, pvclaims[i], persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Bring down the primary site while deleting pvcs") - var wg sync.WaitGroup - wg.Add(2) - go deletePvcInParallel(ctx, client, pvclaims, namespace, &wg) - go siteFailureInParallel(ctx, true, &wg) - wg.Wait() - - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the primary site while deleting pvcs") + var wg sync.WaitGroup + wg.Add(2) + go deletePvcInParallel(ctx, client, pvclaims, namespace, &wg) + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - volumeHandle := pv.Spec.CSI.VolumeHandle - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
- _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + } + + time.Sleep(5 * time.Minute) + if guestCluster { + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Bring up the primary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + volumeHandle := pv.Spec.CSI.VolumeHandle + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + errMsg := "The object or item referred to could not be found" + if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. + _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) + } else { + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - siteRestore(true) + } + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + siteRestore(true) - }) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) /* Pod creation while primary site goes down @@ -1181,127 +1263,128 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 9. 
Delete PVCs created in step 2 */ - ginkgo.It("[primary-centric][control-plane-on-primary] Pod creation while primary site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pods []*v1.Pod - var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("Pod creation while primary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, primaryCentric, controlPlaneOnPrimary), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" + var pods []*v1.Pod + var pvclaims []*v1.PersistentVolumeClaim = make([]*v1.PersistentVolumeClaim, volumeOpsScale) - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc %v", i) - pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - defer func() { - for _, pvclaim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvclaim = nil - } - }() + }() - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, - framework.PodDeleteTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc %v", i) + pvclaims[i], err = createPVC(ctx, client, namespace, nil, diskSize, sc, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - /// Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, 
metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + for _, pvclaim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvclaim = nil + } + }() - ginkgo.By("Bring down the primary site while creating pods") - var wg sync.WaitGroup - wg.Add(2) - ch := make(chan *v1.Pod) - lock := &sync.Mutex{} - go createPodsInParallel(client, namespace, pvclaims, ctx, lock, ch, &wg, volumeOpsScale) - go func() { - for v := range ch { - pods = append(pods, v) + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) } - }() - go siteFailureInParallel(ctx, true, &wg) - wg.Wait() - close(ch) + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, + framework.PodDeleteTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } - }() + /// Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the primary site while creating pods") + var wg sync.WaitGroup + wg.Add(2) + ch := make(chan *v1.Pod) + lock := &sync.Mutex{} + go createPodsInParallel(client, namespace, pvclaims, ctx, lock, ch, &wg, volumeOpsScale) + go func() { + for v := range ch { + pods = append(pods, v) + } + }() + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + close(ch) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Checking whether pods are in Running or ExitCode:0 state or Pending state") - for _, pod := range pods { - framework.Logf("Pod is %s", pod.Name) - err = waitForPodsToBeInErrorOrRunning(ctx, client, pod.Name, namespace, pollTimeout*4) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) 
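
The tests in this file repeatedly use the same concurrency shape while a site is failed over: one goroutine creates objects and streams them over a channel, a second goroutine injects the site failure, and the main goroutine drains the channel after wg.Wait() and close(ch). A minimal, self-contained sketch of that shape follows; createObjectsInParallel and injectFault are illustrative stand-ins, not the suite's createPodsInParallel/siteFailureInParallel helpers.

package main

import (
	"fmt"
	"sync"
)

// createObjectsInParallel stands in for helpers like createPodsInParallel:
// it produces results on ch and signals completion through wg.
func createObjectsInParallel(n int, ch chan<- string, wg *sync.WaitGroup) {
	defer wg.Done()
	for i := 0; i < n; i++ {
		ch <- fmt.Sprintf("pod-%d", i)
	}
}

// injectFault stands in for a fault-injection helper such as siteFailureInParallel.
func injectFault(wg *sync.WaitGroup) {
	defer wg.Done()
	fmt.Println("simulating site failure")
}

func main() {
	var wg sync.WaitGroup
	ch := make(chan string)
	done := make(chan struct{})
	var results []string

	// Collector drains ch until it is closed, then signals done so the
	// results slice is not read while it is still being appended to.
	go func() {
		for v := range ch {
			results = append(results, v)
		}
		close(done)
	}()

	wg.Add(2)
	go createObjectsInParallel(5, ch, &wg)
	go injectFault(&wg)

	wg.Wait() // both workers are finished, so it is safe to close the channel
	close(ch)
	<-done
	fmt.Println("collected:", results)
}

Closing the channel only after wg.Wait() and waiting on done before reading results keeps the sketch race-free, which is the property the producer/collector split in these tests relies on.
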
gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Bring up the primary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(true) - fds.hostsDown = nil - } + ginkgo.By("Checking whether pods are in Running or ExitCode:0 state or Pending state") + for _, pod := range pods { + framework.Logf("Pod is %s", pod.Name) + err = waitForPodsToBeInErrorOrRunning(ctx, client, pod.Name, namespace, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } - for _, pod := range pods { - ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace)) - err = fpod.DeletePodWithWait(ctx, client, pod) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - }) + + for _, pod := range pods { + ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace)) + err = fpod.DeletePodWithWait(ctx, client, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }) /* Label updates to PV, PVC, pod while primary site goes down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create 30 PVCs and wait for them to be bound 3. Add labels to the PVs, PVCs 4. Bring down primary site @@ -1311,418 +1394,361 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 8. Delete the PVCs created in step 2 9. 
Bring primary site up and wait for testbed to be back to normal */ - ginkgo.It("[primary-centric] Label updates to PV, PVC, pod while primary site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim - if os.Getenv(envFullSyncWaitTime) != "" { - fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) - framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } else { - fullSyncWaitTime = defaultFullSyncWaitTime - } + ginkgo.It("[primary-centric] Label updates to PV, PVC, pod while primary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric), func() { - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc") - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") - pvclaims = append(pvclaims, pvc) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + if os.Getenv(envFullSyncWaitTime) != "" { + fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) + framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + fullSyncWaitTime = defaultFullSyncWaitTime + } - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + var pvclaims []*v1.PersistentVolumeClaim + var svcCsipods, csipods *v1.PodList + var volHandles []string + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) + + for i := 0; i < volumeOpsScale; 
i++ { + framework.Logf("Creating pvc") + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + pvclaims = append(pvclaims, pvc) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) } - }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(persistentvolumes[i].Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + svcPVCName := persistentvolumes[i].Spec.CSI.VolumeHandle + err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaims[i], + persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + err = waitAndVerifyCnsVolumeMetadata(ctx, volHandle, pvclaims[i], persistentvolumes[i], nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + volHandles = append(volHandles, volHandle) + } + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for i, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volHandles[i]) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volHandles[i])) + } + }() - ginkgo.By("Bring down the primary site while adding labels to PVCs and PVs") - var wg sync.WaitGroup - labels := make(map[string]string) - labels[labelKey] = labelValue - wg.Add(3) - go updatePvcLabelsInParallel(ctx, client, namespace, labels, pvclaims, &wg) - go updatePvLabelsInParallel(ctx, client, namespace, labels, persistentvolumes, &wg) - go siteFailureInParallel(ctx, true, &wg) - wg.Wait() + ginkgo.By("Get csi pods list before bringing down the site") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("Sleeping full-sync interval for volumes to be updated " + - "with labels in CNS") - time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Check if csi pods are running fine after site failure - err = 
fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the primary site while adding labels to PVCs and PVs") + var wg sync.WaitGroup + labels := make(map[string]string) + labels[labelKey] = labelValue + wg.Add(3) + go updatePvcLabelsInParallel(ctx, client, namespace, labels, pvclaims, &wg) + go updatePvLabelsInParallel(ctx, client, namespace, labels, persistentvolumes, &wg) + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() - persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - for _, pvc := range pvclaims { - ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pvc %s in namespace %s", - labels, pvc.Name, namespace)) - pv := getPvFromClaim(client, namespace, pvc.Name) - err = e2eVSphere.waitForLabelsToBeUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePVC), pvc.Name, pvc.Namespace) + time.Sleep(5 * time.Minute) + ginkgo.By("Check if csi pods are running fine after site failure") + if guestCluster { + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - for _, pv := range persistentvolumes { - ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pv %s", - labels, pv.Name)) - err = e2eVSphere.waitForLabelsToBeUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + framework.Logf("Sleeping full-sync interval for volumes to be updated " + + "with labels in CNS") + time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) - for _, pvclaim := range pvclaims { - err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) + ginkgo.By("Check if csi pods are running fine after site failure") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + + persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - ginkgo.By("Bring up the primary site") - siteRestore(true) + for i, pvc := range pvclaims { + 
ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pvc %s in namespace %s", + labels, pvc.Name, namespace)) + //pv := getPvFromClaim(client, namespace, pvc.Name) + err = e2eVSphere.waitForLabelsToBeUpdated(volHandles[i], labels, + string(cnstypes.CnsKubernetesEntityTypePVC), pvc.Name, pvc.Namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i, pv := range persistentvolumes { + ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pv %s", + labels, pv.Name)) + err = e2eVSphere.waitForLabelsToBeUpdated(volHandles[i], labels, + string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - }) - /* - PVC creation while secondary site goes down and csi provisioner leader is in secondary site - Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication - 2. Ensure csi-provisioner leader is in secondary site - 3. Create 30 PVCs using a thick provision policy so that it takes some time for PVC creation to go through - 4. Bring down secondary site - 5. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site - 6. Verify that the PVCs created in step 3 is bound successfully - 7. Bring secondary site up and wait for testbed to be back to normal - 9. Delete PVC created in step 3 - 10. If there is an orphan volume clean up that using cnsctl - */ - ginkgo.It("[distributed] PVC creation while secondary site goes down"+ - " and csi provisioner leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - var pvclaims []*v1.PersistentVolumeClaim - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - - framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Bring down the secondary site while creating pvcs") - var wg sync.WaitGroup - ch := make(chan *v1.PersistentVolumeClaim) - lock := &sync.Mutex{} - wg.Add(2) - go createPvcInParallel(ctx, client, namespace, diskSize, sc, ch, lock, &wg, volumeOpsScale) - go func() { - for v := range ch { - pvclaims = append(pvclaims, v) - } - }() - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() - close(ch) - - defer func() { for _, pvclaim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + err = 
fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvclaim = nil } - }() - - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - - for _, pvclaim := range pvclaims { - err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - // TODO: List orphan volumes - - ginkgo.By("Bring up the secondary site") - siteRestore(false) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + ginkgo.By("Bring up the primary site") + siteRestore(true) + }) /* - PVC deletion while secondary site goes down and csi provisioner leader is in secondary site + PVC creation while secondary site goes down and csi provisioner leader is in secondary site Steps: 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication 2. Ensure csi-provisioner leader is in secondary site - 3. Create 30 PVCs and wait for each PVC binding with a PV - 4. Delete PVCs created in step 3 - 5. Bring down secondary site - 6. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site - 7. Verify PV and CNS volumes associated with PVC created in step 2 are also deleted successfully - 8. Bring secondary site up and wait for testbed to be back to normal - + 3. Create 30 PVCs using a thick provision policy so that it takes some time for PVC creation to go through + 4. Bring down secondary site + 5. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site + 6. Verify that the PVCs created in step 3 is bound successfully + 7. Bring secondary site up and wait for testbed to be back to normal + 9. Delete PVC created in step 3 + 10. 
If there is an orphan volume clean up that using cnsctl */ - ginkgo.It("[distributed] PVC deletion while secondary site goes down"+ - " and csi provisioner leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("PVC creation while secondary site goes down"+ + " and csi provisioner leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storageThickPolicyName + storageClassName = "nginx-sc-thick" + var pvclaims []*v1.PersistentVolumeClaim - framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc %v", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvclaims = append(pvclaims, pvc) - } - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the secondary site while creating pvcs") + var wg sync.WaitGroup + ch := make(chan *v1.PersistentVolumeClaim) + lock := &sync.Mutex{} + wg.Add(2) + go createPvcInParallel(ctx, client, namespace, 
diskSize, sc, ch, lock, &wg, volumeOpsScale) + go func() { + for v := range ch { + pvclaims = append(pvclaims, v) + } + }() + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + close(ch) - ginkgo.By("Bring down the secondary site while deleting pvcs") - var wg sync.WaitGroup - wg.Add(2) - go deletePvcInParallel(ctx, client, pvclaims, namespace, &wg) - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() + defer func() { + for _, pvclaim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvclaim = nil + } + }() - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } - }() + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - volumeHandle := pv.Spec.CSI.VolumeHandle - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
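
The verification and cleanup blocks in these tests wait for each PV to disappear and treat "The object or item referred to could not be found" as success, only logging possible orphan volumes. A standalone sketch of that tolerate-not-found wait, written against plain client-go; waitForPVGone is an illustrative helper, not part of this suite, and the CNS-side verification is omitted.

package main

import (
	"context"
	"fmt"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// waitForPVGone polls until the named PersistentVolume no longer exists,
// treating a NotFound error as successful deletion.
func waitForPVGone(ctx context.Context, c kubernetes.Interface, name string,
	interval, timeout time.Duration) error {
	return wait.PollUntilContextTimeout(ctx, interval, timeout, true,
		func(ctx context.Context) (bool, error) {
			_, err := c.CoreV1().PersistentVolumes().Get(ctx, name, metav1.GetOptions{})
			if apierrors.IsNotFound(err) {
				return true, nil // already deleted: the outcome the cleanup wants
			}
			if err != nil {
				return false, err // unexpected API error: stop polling
			}
			return false, nil // PV still present: keep polling
		})
}

func main() {
	fmt.Println("wire in a real kubernetes.Interface to use waitForPVGone")
}

Treating NotFound as success rather than as a failure is what lets the cleanup keep going when a volume was already removed while the site was down.
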
- _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { + for _, pvclaim := range pvclaims { + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // TODO: List orphan volumes } - } - ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } + // TODO: List orphan volumes - siteRestore(true) + ginkgo.By("Bring up the secondary site") + siteRestore(false) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) /* - Pod creation while secondary site goes down and csi attacher leader is in secondary site + PVC deletion while secondary site goes down and csi provisioner leader is in secondary site Steps: 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication - 2. Ensure csi-attacher leader is in secondary site + 2. Ensure csi-provisioner leader is in secondary site 3. Create 30 PVCs and wait for each PVC binding with a PV - 4. Create PODs using PVCs created in step 3 + 4. Delete PVCs created in step 3 5. Bring down secondary site 6. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site - 7. Verify that the pods created in step 4 come up successfully - 8. Delete pods created in step 4 and 8 - 9. Bring secondary site up and wait for testbed to be back to normal - 10. Delete PVCs created in step 2 + 7. Verify PV and CNS volumes associated with PVC created in step 2 are also deleted successfully + 8. 
Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[distributed] Pod creation while secondary site goes down"+ - " and csi attacher leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim - var pods []*v1.Pod - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("PVC deletion while secondary site goes down"+ + " and csi provisioner leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" + var pvclaims []*v1.PersistentVolumeClaim - framework.Logf("Ensuring %s leader is in secondary site", attacherContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, attacherContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc %v", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvclaims = append(pvclaims, pvc) - } - defer func() { - for _, pvclaim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc %v", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvclaim = nil + pvclaims = append(pvclaims, pvc) } - }() - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) } + + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the secondary site while deleting pvcs") + var wg sync.WaitGroup + wg.Add(2) + go deletePvcInParallel(ctx, client, pvclaims, namespace, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify PVs, volumes are deleted from CNS") for _, pv := range persistentvolumes { volumeHandle := pv.Spec.CSI.VolumeHandle @@ -1738,219 +1764,319 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f gomega.Expect(err).NotTo(gomega.HaveOccurred()) err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // TODO: List orphan volumes } } - }() - - /// Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Bring down the secondary site while creating pods") - var wg sync.WaitGroup - wg.Add(2) - ch := make(chan *v1.Pod) - lock := &sync.Mutex{} - go createPodsInParallel(client, namespace, pvclaims, ctx, lock, ch, &wg, volumeOpsScale) - go func() { - for v := range ch { - pods = append(pods, v) - } - }() - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() - close(ch) - - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 { siteRestore(false) fds.hostsDown = []string{} } - }() - - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Checking whether pods are in Running or ExitCode:0 state") - for _, pod := range pods { - framework.Logf("Pod is %s", pod.Name) - err = waitForPodsToBeInErrorOrRunning(ctx, client, pod.Name, namespace, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + siteRestore(true) - for _, pod := range pods { - ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace)) - err = fpod.DeletePodWithWait(ctx, client, pod) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - }) + }) /* - Pod deletion while secondary site goes down and csi attacher leader is in secondary site + Pod creation while secondary site goes down and csi attacher leader is in secondary site Steps: 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication 2. Ensure csi-attacher leader is in secondary site 3. Create 30 PVCs and wait for each PVC binding with a PV 4. Create PODs using PVCs created in step 3 - 5. Delete pods created in step 4 - 6. Bring down secondary site - 7. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site - 8. Verify that the pods get deleted successfully - 9. Verify volumeattachments are also deleted - 10. Verify CNS volume metadata for the volumes - 11. Bring secondary site up and wait for testbed to be back to normal - 12. Delete PVCs created in step 2 + 5. Bring down secondary site + 6. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site + 7. Verify that the pods created in step 4 come up successfully + 8. Delete pods created in step 4 and 8 + 9. Bring secondary site up and wait for testbed to be back to normal + 10. Delete PVCs created in step 2 */ - ginkgo.It("[distributed] Pod deletion while secondary site goes down"+ - " and csi attacher leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim - var pods []*v1.Pod - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("Pod creation while secondary site goes down"+ + " and csi attacher leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" + var pvclaims []*v1.PersistentVolumeClaim + var pods []*v1.Pod - framework.Logf("Ensuring %s leader is in secondary site", attacherContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, attacherContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := 
client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc") - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + framework.Logf("Ensuring %s leader is in secondary site", attacherContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, attacherContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvclaims = append(pvclaims, pvc) - } - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc %v", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvclaims = append(pvclaims, pvc) } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) + + defer func() { + for _, pvclaim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvclaim = nil + } + }() + + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) } - }() + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + volumeHandle := pv.Spec.CSI.VolumeHandle + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + errMsg := "The object or item referred to could not be found" + if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
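
The branch here keys off checkForEventWithMessage to decide whether a stuck PV deletion is the known "object or item could not be found" case. A minimal sketch of how such an event check can be expressed with client-go; hasEventWithMessage is illustrative and not the suite's helper, which may filter differently (for example by field selector).

package main

import (
	"context"
	"fmt"
	"strings"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// hasEventWithMessage lists events in a namespace and reports whether any
// event for objName (or any object, when objName is empty) contains msg.
func hasEventWithMessage(ctx context.Context, c kubernetes.Interface,
	namespace, objName, msg string) (bool, error) {
	events, err := c.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return false, err
	}
	for _, ev := range events.Items {
		if (objName == "" || ev.InvolvedObject.Name == objName) &&
			strings.Contains(ev.Message, msg) {
			return true, nil
		}
	}
	return false, nil
}

func main() {
	fmt.Println("wire in a real kubernetes.Interface to use hasEventWithMessage")
}

Gating the orphan-volume tolerance on a matching event keeps the test from silently ignoring deletion failures that have a different root cause.
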
+ _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) + } else { + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } + }() - ginkgo.By("Create pods") - for i := 0; i < volumeOpsScale; i++ { - pod, err := createPod(ctx, client, namespace, - nil, []*v1.PersistentVolumeClaim{pvclaims[i]}, - false, execCommand) - framework.Logf("Created pod %s", pod.Name) + /// Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pods = append(pods, pod) - } - defer func() { + + ginkgo.By("Bring down the secondary site while creating pods") + var wg sync.WaitGroup + wg.Add(2) + ch := make(chan *v1.Pod) + lock := &sync.Mutex{} + go createPodsInParallel(client, namespace, pvclaims, ctx, lock, ch, &wg, volumeOpsScale) + go func() { + for v := range ch { + pods = append(pods, v) + } + }() + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + close(ch) + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Checking whether pods are in Running or ExitCode:0 state") for _, pod := range pods { - err = fpod.DeletePodWithWait(ctx, client, pod) + framework.Logf("Pod is %s", pod.Name) + err = waitForPodsToBeInErrorOrRunning(ctx, client, pod.Name, namespace, pollTimeout*4) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring down the secondary site while deleting pods") - var wg sync.WaitGroup - wg.Add(2) - go deletePodsInParallel(ctx, client, namespace, pods, &wg) - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() - - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 { siteRestore(false) fds.hostsDown = []string{} } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, pod := range pods { + ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace)) + err = fpod.DeletePodWithWait(ctx, client, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }) + + /* + Pod deletion while secondary site goes down 
and csi attacher leader is in secondary site + Steps: + 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 2. Ensure csi-attacher leader is in secondary site + 3. Create 30 PVCs and wait for each PVC binding with a PV + 4. Create PODs using PVCs created in step 3 + 5. Delete pods created in step 4 + 6. Bring down secondary site + 7. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site + 8. Verify that the pods get deleted successfully + 9. Verify volumeattachments are also deleted + 10. Verify CNS volume metadata for the volumes + 11. Bring secondary site up and wait for testbed to be back to normal + 12. Delete PVCs created in step 2 - for _, pod := range pods { - framework.Logf("Wait up to %v for pod %q to be fully deleted", pollTimeout, pod.Name) - err = fpod.WaitForPodNotFoundInNamespace(ctx, client, pod.Name, namespace, pollTimeout) + */ + ginkgo.It("Pod deletion while secondary site goes down"+ + " and csi attacher leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" + var pvclaims []*v1.PersistentVolumeClaim + var pods []*v1.Pod + + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - ginkgo.By("Verify volume is detached from the node") - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - isDiskDetached, err := e2eVSphere.waitForVolumeDetachedFromNode(client, volHandle, pods[i].Spec.NodeName) + framework.Logf("Ensuring %s leader is in secondary site", attacherContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, attacherContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(isDiskDetached).To(gomega.BeTrue(), - fmt.Sprintf("Volume %q is not detached from the node %q", volHandle, pods[i].Spec.NodeName)) - } - ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc") + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvclaims = append(pvclaims, pvc) + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, 
client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + ginkgo.By("Create pods") + for i := 0; i < volumeOpsScale; i++ { + pod, err := createPod(ctx, client, namespace, + nil, []*v1.PersistentVolumeClaim{pvclaims[i]}, + false, execCommand) + framework.Logf("Created pod %s", pod.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pods = append(pods, pod) + } + defer func() { + for _, pod := range pods { + err = fpod.DeletePodWithWait(ctx, client, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the secondary site while deleting pods") + var wg sync.WaitGroup + wg.Add(2) + go deletePodsInParallel(ctx, client, namespace, pods, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + for _, pod := range pods { + framework.Logf("Wait up to %v for pod %q to be fully deleted", pollTimeout, pod.Name) + err = fpod.WaitForPodNotFoundInNamespace(ctx, client, pod.Name, namespace, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Verify volume is detached from the node") + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + isDiskDetached, err := e2eVSphere.waitForVolumeDetachedFromNode(client, volHandle, pods[i].Spec.NodeName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(isDiskDetached).To(gomega.BeTrue(), + fmt.Sprintf("Volume %q is not detached from the node %q", volHandle, pods[i].Spec.NodeName)) + + } + ginkgo.By("Bring up the secondary site") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) /* Secondary site down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create a statefulset, deployment with volumes from the stretched datastore 3. Bring down the secondary site 4. Verify that the VMs hosted by esx servers are brought up on the other site @@ -1959,112 +2085,170 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 6. 
Bring secondary site up and wait for testbed to be back to normal 7. Delete all objects created in step 2 and 5 */ - ginkgo.It("[control-plane-on-primary] Secondary site down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - framework.Logf("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.It("Secondary site down", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, controlPlaneOnPrimary), func() { - ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 3, "", 1, "") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() - ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - replicas := *(statefulset.Spec.Replicas) + var stsReplicas, depReplicaCount int32 + var statefulset *appsv1.StatefulSet + var svcCsipods, csipods *v1.PodList - defer func() { - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + if rwxAccessMode { + depReplicaCount = 3 + stsReplicas = 3 + } else { + depReplicaCount = 1 + stsReplicas = 4 + } + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
- _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() + + ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") + statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, stsReplicas, "", depReplicaCount, accessMode) + ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) } + }() + + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring down the secondary site") - siteFailover(ctx, false) + ginkgo.By("Bring down the secondary site") + siteFailover(ctx, false) + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failurein supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying volume lifecycle actions 
works fine")
+			volumeLifecycleActions(ctx, client, namespace, sc, "")
+
+			// Statefulset and deployments in PodVM might go to Terminating state as
+			// the nodes attached to these pods might become inaccessible during site failure.
+			// Hence these steps are validated once the site is restored.
+			if !supervisorCluster {
+				ginkgo.By("Performing scaledown operation on statefulset when site is down")
+				stsReplicas = 2
+				scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset,
+					ssPodsBeforeScaleDown, stsReplicas, true, true)
+
+				ginkgo.By("Performing scaleup operation on statefulset when site is down")
+				stsReplicas = 6
+				scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset,
+					stsReplicas, true, true)
+
+				if rwxAccessMode {
+					ginkgo.By("Performing scaleup operation on deployment when site is down")
+					depReplicaCount = 4
+					updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace)
+					verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil,
+						nil, "")
+				}
+			}
+
+			ginkgo.By("Bring up the secondary site")
 			if len(fds.hostsDown) > 0 && fds.hostsDown != nil {
 				siteRestore(false)
 				fds.hostsDown = nil
 			}
-		}()
 
-		ginkgo.By("Wait for k8s cluster to be healthy")
-		wait4AllK8sNodesToBeUp(ctx, client, nodeList)
-		err = waitForAllNodes2BeReady(ctx, client)
-		gomega.Expect(err).NotTo(gomega.HaveOccurred())
-
-		// Check if csi pods are running fine after site failure
-		err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout)
-		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+			err = waitForAllNodes2BeReady(ctx, client)
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
 
-		ginkgo.By("Verifying volume lifecycle actions works fine")
-		volumeLifecycleActions(ctx, client, namespace, sc, "")
-		// Scale down replicas of statefulset and verify CNS entries for volumes
-		scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset,
-			ssPodsBeforeScaleDown, replicas-1, true, true)
-		// Scale up replicas of statefulset and verify CNS entries for volumes
-		scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset,
-			replicas, true, true)
-
-		ginkgo.By("Bring up the secondary site")
-		if len(fds.hostsDown) > 0 && fds.hostsDown != nil {
-			siteRestore(false)
-			fds.hostsDown = nil
-		}
+			if supervisorCluster {
+				ginkgo.By("Performing scaledown operation on statefulset after site recovery")
+				stsReplicas = 2
+				scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset,
+					ssPodsBeforeScaleDown, stsReplicas, true, true)
 
-		ginkgo.By("Wait for k8s cluster to be healthy")
-		// wait for the VMs to move back
-		err = waitForAllNodes2BeReady(ctx, client)
-		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+				ginkgo.By("Performing scaleup operation on statefulset after site recovery")
+				stsReplicas = 6
+				scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset,
+					stsReplicas, true, true)
+
+				if rwxAccessMode {
+					ginkgo.By("Performing scaleup operation on deployment after site recovery")
+					depReplicaCount = 4
+					updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace)
+					verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil,
+						nil, "")
+				}
+			}
 
-		scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace)
-	})
+			ginkgo.By("Scale down statefulset and deployment after site recovery")
+			scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace)
+		})
 
 	/*
 	   Network failure between sites
 	   Steps:
-	   1. 
Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. create a statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all the replicas to be running 3. Change replica counts of sts1 ans sts 2 to 3 replicas @@ -2080,187 +2264,190 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f volume and application lifecycle actions work fine 11. Cleanup all objects created so far in the test */ - ginkgo.It("[control-plane-on-primary][distributed]"+ - "[csi-vsan-stretch-wcp][csi-vsan-stretch-tkg] Network failure between sites", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 - var statefulset1, statefulset2 *appsv1.StatefulSet - var deployment1, deployment2 *appsv1.Deployment - var err error - var svcCsipods *v1.PodList + ginkgo.It("Network failure between sites", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, + controlPlaneOnPrimary, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + var statefulset1, statefulset2 *appsv1.StatefulSet + var deployment1, deployment2 *appsv1.Deployment + var err error + var svcCsipods *v1.PodList + + if vanillaCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - if vanillaCluster { - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) - sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + framework.Logf("Creating service") + service := CreateService(namespace, client) defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + deleteService(namespace, client, service) }() - } else if guestCluster { - ginkgo.By("CNS_TEST: Running for GC setup") - sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + if rwxAccessMode { + dep1ReplicaCount = 1 + dep2ReplicaCount = 5 + } else { + dep1ReplicaCount = 1 + dep2ReplicaCount = 1 + } + sts1Replicas = 1 + sts2Replicas = 2 - framework.Logf("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + 
"the replicas to be running") - if rwxAccessMode { - dep1ReplicaCount = 1 - dep2ReplicaCount = 5 - } else { - dep1ReplicaCount = 1 - dep2ReplicaCount = 1 - } - sts1Replicas = 1 - sts2Replicas = 2 - - ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + - "the replicas to be running") + statefulset1, deployment1, _ = createStsDeployment(ctx, client, namespace, sc, + true, true, sts1Replicas, "web", dep1ReplicaCount, accessMode) - statefulset1, deployment1, _ = createStsDeployment(ctx, client, namespace, sc, - true, true, sts1Replicas, "web", dep1ReplicaCount, accessMode) + statefulset2, deployment2, _ = createStsDeployment(ctx, client, namespace, sc, + true, true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessMode) - statefulset2, deployment2, _ = createStsDeployment(ctx, client, namespace, sc, - true, true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) + replicas2 := *(statefulset2.Spec.Replicas) - ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - replicas2 := *(statefulset2.Spec.Replicas) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - if guestCluster { - svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + sts1Replicas += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) + fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) + + sts2Replicas -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) + fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) - sts1Replicas += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) - fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - sts2Replicas -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) - fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) + ginkgo.By("Isolate secondary site from witness and primary site") + siteNetworkFailure(false, false) - ginkgo.By("Isolate secondary site from witness and primary site") - siteNetworkFailure(false, false) + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsPartitioned) > 0 && fds.hostsPartitioned != nil { + siteNetworkFailure(false, true) + fds.hostsPartitioned = nil + } + }() - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") - if len(fds.hostsPartitioned) > 0 && fds.hostsPartitioned != nil { - siteNetworkFailure(false, true) - fds.hostsPartitioned = nil + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = 
waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - if vanillaCluster { - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - } - if guestCluster || vanillaCluster { - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failurein supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - if guestCluster { - ginkgo.By("Check if csi pods are running fine after site failure in supervisor") - err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - time.Sleep(5 * time.Minute) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, false, true) - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - sts1Replicas, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, false, true) + if rwxAccessMode { + dep1ReplicaCount += 2 + err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount -= 2 + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } - if rwxAccessMode { - dep1ReplicaCount += 2 - err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, - nil, "") - dep2ReplicaCount -= 2 - err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment1.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, - nil, "") - } + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, accessMode) - ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc, accessMode) + // Scaling up statefulset sts1 + 
sts1Replicas += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + sts1Replicas, true, false) - // Scaling up statefulset sts1 - sts1Replicas += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - sts1Replicas, true, false) + // Scaling down statefulset sts2 + sts2Replicas -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, sts2Replicas, true, false) - // Scaling down statefulset sts2 - sts2Replicas -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, sts2Replicas, true, false) + if rwxAccessMode { + dep1ReplicaCount += 2 + err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, + nil, "") + dep2ReplicaCount -= 2 + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, + nil, "") + } - if rwxAccessMode { - dep1ReplicaCount += 2 - err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil, - nil, "") - dep2ReplicaCount -= 2 - err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment1.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil, - nil, "") - } + ginkgo.By("Bring up the secondary site by removing network failure") + if len(fds.hostsPartitioned) > 0 && fds.hostsPartitioned != nil { + siteNetworkFailure(false, true) + fds.hostsPartitioned = nil + } - ginkgo.By("Bring up the secondary site by removing network failure") - if len(fds.hostsPartitioned) > 0 && fds.hostsPartitioned != nil { - siteNetworkFailure(false, true) - fds.hostsPartitioned = nil - } + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + wait4AllK8sNodesToBeUp(svcNodeList) + err = waitForAllNodes2BeReady(ctx, svcClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - if guestCluster { - ginkgo.By("Check for nodes to be in Ready state in supervisor") - wait4AllK8sNodesToBeUp(ctx, svcClient, svcNodeList) - err = waitForAllNodes2BeReady(ctx, svcClient) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc, accessMode) - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, accessMode) - }) + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + }) /* Witness failure Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Bring down the witness host 3. 
Run volume and application lifecycle actions, verify provisioning goes through but VM and storage compliance are false. @@ -2268,83 +2455,151 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 5. Run volume and application lifecycle actions 6. Cleanup all objects created in step 3 and 5 */ - ginkgo.It("[distributed] Witness failure", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.It("Witness failure", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + var stsReplicas, depReplicaCount int32 + var statefulset *appsv1.StatefulSet + var svcCsipods, csipods *v1.PodList + + if rwxAccessMode { + depReplicaCount = 3 + stsReplicas = 3 + } else { + depReplicaCount = 1 + stsReplicas = 4 + } - ginkgo.By("Bring down witness host") - toggleWitnessPowerState(ctx, true) + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - defer func() { - ginkgo.By("Bring up the witness host before terminating the test") - if fds.witnessDown != "" { - toggleWitnessPowerState(ctx, false) + ginkgo.By("Get csi pods list before bringing down witness host") + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down witness host") + toggleWitnessPowerState(ctx, true) + defer func() { + ginkgo.By("Bring up the witness host before terminating the test") + if fds.witnessDown != "" { + toggleWitnessPowerState(ctx, false) + } + }() - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.By("Check if csi pods are running fine after witness failure") + if guestCluster { + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + ginkgo.By("Check if csi pods are running 
fine after witness failure") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, false, 3, "", 1, - "") - replicas := *(statefulset.Spec.Replicas) - ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - // Scale down replicas of statefulset and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - comp := checkVmStorageCompliance(client, storagePolicyName) - if !comp { - framework.Failf("Expected VM and storage compliance to be false but found true") - } + ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") + statefulset, deployment, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, stsReplicas, "", depReplicaCount, accessMode) + ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() - ginkgo.By("Bring up witness host") - if fds.witnessDown != "" { - toggleWitnessPowerState(ctx, false) - } + ginkgo.By("Check storage compliance") + comp := checkVmStorageCompliance(client, storagePolicyName) + if !comp { + framework.Failf("Expected VM and storage compliance to be false but found true") + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring up witness host") + if fds.witnessDown != "" { + toggleWitnessPowerState(ctx, false) + } - ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc, "") + err = 
waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By(fmt.Sprintf("Scaling up statefulset: %v to number of Replica: %v", - statefulset.Name, replicas)) - // Scale up replicas of statefulset and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, "") - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - }) + ginkgo.By("Performing scaledown operation on statefulset") + stsReplicas = 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + ssPodsBeforeScaleDown, stsReplicas, true, true) + + ginkgo.By("Performing scaleup operation on statefulset") + stsReplicas = 6 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + stsReplicas, true, true) + + if rwxAccessMode { + ginkgo.By("Performing scaleup operation on deployment") + depReplicaCount = 4 + updateDeploymentReplicawithWait(client, depReplicaCount, deployment.Name, namespace) + verifyVolumeMetadataOnDeployments(ctx, client, deployment, namespace, nil, nil, + nil, "") + } + + ginkgo.By("Check storage compliance") + comp = checkVmStorageCompliance(client, storagePolicyName) + if !comp { + framework.Failf("Expected VM and storage compliance to be true but found false") + } + + ginkgo.By("Scale down statefulset and deployment after site recovery") + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + }) /* Statefulset scale up/down while secondary site goes down when csi provisioner and @@ -2363,50 +2618,120 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 9. Delete statefulsets and its pvcs created in step 2 10. 
Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[distributed] Statefulset scale up/down while secondary site goes down when csi provisioner"+ - " and attacher leaders are in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.It("Statefulset scale up/down while secondary site goes down when csi provisioner"+ + " and attacher leaders are in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storageThickPolicyName + storageClassName = "nginx-sc-thick" + + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("Ensuring %s leader is in secondary site", attacherContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, attacherContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Ensuring %s leader is in secondary site", attacherContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, attacherContainerName, false) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + - "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, - accessMode) - replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, 
client, namespace, sc, false, true, 5, "web-nginx", 1, - accessMode) - ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - replicas2 := *(statefulset2.Spec.Replicas) + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") + statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, + accessMode) + replicas1 := *(statefulset1.Spec.Replicas) + statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, + accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) + replicas2 := *(statefulset2.Spec.Replicas) + + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + replicas1 += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) + fss.UpdateReplicas(ctx, client, statefulset1, replicas1) + + replicas2 -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) + fss.UpdateReplicas(ctx, client, statefulset2, replicas2) + + ginkgo.By("Bring down the secondary site") + siteFailover(ctx, false) + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + replicas1, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, false, true) + + // Scaling up statefulset sts1 + replicas1 += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + replicas1, true, false) + + // Scaling down statefulset sts2 + replicas2 -= 2 + 
scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, true, false) - defer func() { scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -2424,87 +2749,18 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ "kubernetes", volumeHandle)) } - }() - - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - replicas1 += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) - fss.UpdateReplicas(ctx, client, statefulset1, replicas1) - - replicas2 -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) - fss.UpdateReplicas(ctx, client, statefulset2, replicas2) - - ginkgo.By("Bring down the secondary site") - siteFailover(ctx, false) - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 { siteRestore(false) fds.hostsDown = []string{} } - }() - - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, false, true) - - // Scaling up statefulset sts1 - replicas1 += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, true, false) - - // Scaling down statefulset sts2 - replicas2 -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, true, false) - - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) - } - - 
ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) /* Label updates to PV, PVC while primary site goes down @@ -2521,143 +2777,144 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 9. Delete the PVCs created in step 2 10. Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[distributed] Label updates to PV, PVC while primary site goes down"+ - " when syncer pod leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim - if os.Getenv(envFullSyncWaitTime) != "" { - fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) - framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } else { - fullSyncWaitTime = defaultFullSyncWaitTime - } + ginkgo.It("Label updates to PV, PVC while primary site goes down"+ + " when syncer pod leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storageThickPolicyName + storageClassName = "nginx-sc-default" + var pvclaims []*v1.PersistentVolumeClaim + if os.Getenv(envFullSyncWaitTime) != "" { + fullSyncWaitTime, err := strconv.Atoi(os.Getenv(envFullSyncWaitTime)) + framework.Logf("Full-Sync interval time value is = %v", fullSyncWaitTime) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + fullSyncWaitTime = defaultFullSyncWaitTime + } - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - - framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - for i := 0; i < 
volumeOpsScale; i++ { - framework.Logf("Creating pvc") - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") - pvclaims = append(pvclaims, pvc) + framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc") + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + pvclaims = append(pvclaims, pvc) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) } - }() - - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring down the secondary site while adding labels to PVCs and PVs") - var wg sync.WaitGroup - labels := make(map[string]string) - labels[labelKey] = labelValue + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() - wg.Add(3) - go updatePvcLabelsInParallel(ctx, client, namespace, labels, pvclaims, &wg) - go updatePvLabelsInParallel(ctx, client, namespace, labels, persistentvolumes, &wg) - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the secondary site while adding labels to PVCs and PVs") + var wg sync.WaitGroup + labels := make(map[string]string) + labels[labelKey] = labelValue - framework.Logf("Sleeping full-sync interval for volumes to be updated " + - "with labels in CNS") - time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) + wg.Add(3) + go updatePvcLabelsInParallel(ctx, client, namespace, labels, pvclaims, &wg) + go updatePvLabelsInParallel(ctx, client, namespace, labels, persistentvolumes, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + framework.Logf("Sleeping full-sync interval for volumes to be updated " + + "with labels in CNS") + time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) - for _, pvc := range pvclaims { - ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pvc %s in namespace %s", - labels, pvc.Name, namespace)) - pv := getPvFromClaim(client, namespace, pvc.Name) - err = e2eVSphere.waitForLabelsToBeUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePVC), pvc.Name, pvc.Namespace) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - for _, pv := range persistentvolumes { - ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pv %s", - labels, pv.Name)) - err = e2eVSphere.waitForLabelsToBeUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) + persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - for _, pvclaim := range pvclaims { - err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for _, pvc := range pvclaims { + ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pvc %s in namespace %s", + labels, pvc.Name, namespace)) + pv := getPvFromClaim(client, namespace, pvc.Name) + err = e2eVSphere.waitForLabelsToBeUpdated(pv.Spec.CSI.VolumeHandle, labels, + string(cnstypes.CnsKubernetesEntityTypePVC), 
pvc.Name, pvc.Namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Bring up the primary site") - siteRestore(false) + for _, pv := range persistentvolumes { + ginkgo.By(fmt.Sprintf("Waiting for labels %+v to be updated for pv %s", + labels, pv.Name)) + err = e2eVSphere.waitForLabelsToBeUpdated(pv.Spec.CSI.VolumeHandle, labels, + string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, pvclaim := range pvclaims { + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumes { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - }) + ginkgo.By("Bring up the primary site") + siteRestore(false) + + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + }) /* Statefulset scale up/down while secondary site goes down when csi driver leader is in secondary site @@ -2675,93 +2932,163 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 9. Delete statefulsets and its pvcs created in step 2 10. Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[distributed] Statefulset scale up/down while secondary site goes down"+ - " when csi driver leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - csiPodOnSite, nodeName := "", "" - ignoreLabels := make(map[string]string) - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.It("Statefulset scale up/down while secondary site goes down"+ + " when csi driver leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storageThickPolicyName + storageClassName = "nginx-sc-thick" + csiPodOnSite, nodeName := "", "" + ignoreLabels := make(map[string]string) + + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := 
client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - framework.Logf("Ensuring %s leader is in secondary site", csiDriverContainerName) - // Fetch master ip present on that site - masterIpOnSite, err := getMasterIpOnSite(ctx, client, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Ensuring %s leader is in secondary site", csiDriverContainerName) + // Fetch master ip present on that site + masterIpOnSite, err := getMasterIpOnSite(ctx, client, false) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Fetch the name of master node on that site from the IP address - k8sNodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, node := range k8sNodes.Items { - addrs := node.Status.Addresses - for _, addr := range addrs { - if addr.Type == v1.NodeInternalIP && (net.ParseIP(addr.Address)).To4() != nil && - addr.Address == masterIpOnSite { - nodeName = node.Name - break + // Fetch the name of master node on that site from the IP address + k8sNodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, node := range k8sNodes.Items { + addrs := node.Status.Addresses + for _, addr := range addrs { + if addr.Type == v1.NodeInternalIP && (net.ParseIP(addr.Address)).To4() != nil && + addr.Address == masterIpOnSite { + nodeName = node.Name + break + } } } - } - // Get the name pf csi controller pod running on master node on that site - csiPods, err := fpod.GetPodsInNamespace(ctx, client, csiSystemNamespace, ignoreLabels) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, csiPod := range csiPods { - if strings.Contains(csiPod.Name, vSphereCSIControllerPodNamePrefix) && - csiPod.Spec.NodeName == nodeName { - csiPodOnSite = csiPod.Name + // Get the name pf csi controller pod running on master node on that site + csiPods, err := fpod.GetPodsInNamespace(ctx, client, csiSystemNamespace, ignoreLabels) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, csiPod := range csiPods { + if strings.Contains(csiPod.Name, vSphereCSIControllerPodNamePrefix) && + csiPod.Spec.NodeName == nodeName { + csiPodOnSite = csiPod.Name + } } - } - // Delete csi controller pods on other masters which is not present on that site - deleteCsiControllerPodOnOtherMasters(ctx, client, csiPodOnSite) - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - masterIpOnSecSite, err := getMasterIpOnSite(ctx, client, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - allCsiContainerNames := []string{provisionerContainerName, attacherContainerName, - resizerContainerName, snapshotterContainerName} - - for _, containerName := range allCsiContainerNames { - _, masterIp, err := getK8sMasterNodeIPWhereContainerLeaderIsRunning(ctx, client, - sshClientConfig, containerName) + // Delete csi controller pods on other masters which is not present on that site + deleteCsiControllerPodOnOtherMasters(ctx, client, csiPodOnSite) + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + masterIpOnSecSite, err := getMasterIpOnSite(ctx, client, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - if masterIp != masterIpOnSecSite { - framework.Failf("couldn't get :%s container on master node ip: %s", - containerName, masterIpOnSite) + allCsiContainerNames := []string{provisionerContainerName, attacherContainerName, + resizerContainerName, snapshotterContainerName} + + for _, containerName := range allCsiContainerNames { + _, masterIp, err := getK8sMasterNodeIPWhereContainerLeaderIsRunning(ctx, client, + sshClientConfig, containerName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if masterIp != masterIpOnSecSite { + framework.Failf("couldn't get :%s container on master node ip: %s", + containerName, masterIpOnSite) + } } - } - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + - "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, - accessMode) - replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, - accessMode) - ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - replicas2 := *(statefulset2.Spec.Replicas) + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") + statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, + accessMode) + replicas1 := *(statefulset1.Spec.Replicas) + statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, + accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) + replicas2 := *(statefulset2.Spec.Replicas) + + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + // Get the list of csi pods running in CSI namespace + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + replicas1 += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) + 
fss.UpdateReplicas(ctx, client, statefulset1, replicas1) + + replicas2 -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) + fss.UpdateReplicas(ctx, client, statefulset2, replicas2) + + ginkgo.By("Bring down the secondary site") + siteFailover(ctx, false) + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + replicas1, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, false, true) + + // Scaling up statefulset sts1 + replicas1 += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + replicas1, true, false) + + // Scaling down statefulset sts2 + replicas2 -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, true, false) - defer func() { scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -2779,92 +3106,23 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ "kubernetes", volumeHandle)) } - }() - // Get the list of csi pods running in CSI namespace - csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - replicas1 += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) - fss.UpdateReplicas(ctx, client, statefulset1, replicas1) - - replicas2 -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) - fss.UpdateReplicas(ctx, client, statefulset2, replicas2) - - ginkgo.By("Bring down the secondary site") - siteFailover(ctx, false) - - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 { siteRestore(false) fds.hostsDown = []string{} } - }() - - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for 
volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, false, true) - - // Scaling up statefulset sts1 - replicas1 += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, true, false) - - // Scaling down statefulset sts2 - replicas2 -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, true, false) - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) - } - - ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) /* Statefulset scale up/down while secondary site goes down Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication + 1. Configure a vsan stretched cluster testbed. 2. Create two statefulset with replica count 1(sts1) and 5(sts2) respectively and wait for all replicas to be running 3. Change replica count of sts1 and sts2 to 3 @@ -2879,41 +3137,115 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 10. Delete the PVCs used by statefulsets in step 2 11. 
Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[control-plane-on-primary] Statefulset scale up/down while secondary site goes down", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.It("Statefulset scale up/down while secondary site goes down", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, controlPlaneOnPrimary), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storageThickPolicyName + storageClassName = "nginx-sc-thick" + + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + - "the replicas to be running") - statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, - accessMode) - replicas1 := *(statefulset1.Spec.Replicas) - statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, - accessMode) - ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - replicas2 := *(statefulset2.Spec.Replicas) + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") + statefulset1, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, "web", 1, + accessMode) + replicas1 := *(statefulset1.Spec.Replicas) + statefulset2, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 5, "web-nginx", 1, + accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) + replicas2 := *(statefulset2.Spec.Replicas) + + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, 
namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() + + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + replicas1 += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) + fss.UpdateReplicas(ctx, client, statefulset1, replicas1) + + replicas2 -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, replicas2)) + fss.UpdateReplicas(ctx, client, statefulset2, replicas2) + + ginkgo.By("Bring down the secondary site") + siteFailover(ctx, false) + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") + // Scale up replicas of statefulset1 and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + replicas1, false, true) + // Scale down replicas of statefulset2 and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, false, true) + + // Scaling up statefulset sts1 + replicas1 += 2 + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, + replicas1, true, false) + + // Scaling down statefulset sts2 + replicas2 -= 2 + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, + ss2PodsBeforeScaleDown, replicas2, true, false) - defer func() { scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -2931,88 +3263,18 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ "kubernetes", volumeHandle)) } - }() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - replicas1 += 2 - ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, replicas1)) - fss.UpdateReplicas(ctx, client, statefulset1, replicas1) - - replicas2 -= 2 - ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: 
%v", statefulset2.Name, replicas2)) - fss.UpdateReplicas(ctx, client, statefulset2, replicas2) - - ginkgo.By("Bring down the secondary site") - siteFailover(ctx, false) - - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 && fds.hostsDown != nil { siteRestore(false) fds.hostsDown = nil } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2") - // Scale up replicas of statefulset1 and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, false, true) - // Scale down replicas of statefulset2 and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, false, true) - - // Scaling up statefulset sts1 - replicas1 += 2 - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1, - replicas1, true, false) - - // Scaling down statefulset sts2 - replicas2 -= 2 - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2, - ss2PodsBeforeScaleDown, replicas2, true, false) - - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ - "kubernetes", volumeHandle)) - } - - ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - siteRestore(false) - fds.hostsDown = nil - } - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - }) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) /* Operation Storm: @@ -3026,214 +3288,215 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 7. Create 50 new PVCs 8. Wait for secondary site VMs to come up and k8s to be healthy 9. Verify all stateful sets have scaled up/down successfully - 10. Scale down first 50 sts to 2 replicas - 11. Scale up second 50 statefulsets to 1 replica + 10. Scale down first 50 sts to 2 replicas. + 11. Scale up second 50 statefulsets to 1 replica. 12. 
Verify all stateful sets have scaled up/down successfully 13. Delete all stateful sets 14. Delete all PVCs 15. Bring secondary site up and wait for testbed to be normal */ - ginkgo.It("[distributed] Operation Storm", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - var pvclaims []*v1.PersistentVolumeClaim - var pvcList []*v1.PersistentVolumeClaim - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + ginkgo.It("[distributed] Operation Storm", + ginkgo.Label(p2, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" + var pvclaims []*v1.PersistentVolumeClaim + var pvcList []*v1.PersistentVolumeClaim + + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - var stsList []*appsv1.StatefulSet - var replicas1, replicas2 int32 - prefix1 := "storm1-sts-" - prefix2 := "storm2-sts-" - framework.Logf("Create %d statefulsets with prefix %s", operationStormScale, prefix1) - for i := 0; i < operationStormScale; i++ { - statefulsetName := prefix1 + strconv.Itoa(i) - framework.Logf("Creating statefulset: %s", statefulsetName) - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, statefulsetName, 1, - accessMode) - replicas1 = *(statefulset.Spec.Replicas) - stsList = append(stsList, statefulset) - } + var stsList []*appsv1.StatefulSet + var replicas1, replicas2 int32 + prefix1 := "storm1-sts-" + prefix2 := "storm2-sts-" + framework.Logf("Create %d statefulsets with prefix %s", operationStormScale, prefix1) + for i := 0; i < operationStormScale; i++ { + statefulsetName := prefix1 + strconv.Itoa(i) + framework.Logf("Creating statefulset: %s", statefulsetName) + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 1, statefulsetName, 1, + accessMode) + replicas1 = *(statefulset.Spec.Replicas) + stsList = append(stsList, statefulset) + } - framework.Logf("Create %d statefulsets with prefix %s", operationStormScale, prefix2) - for i := 0; i < 
operationStormScale; i++ { - statefulsetName := prefix2 + strconv.Itoa(i) - framework.Logf("Creating statefulset: %s", statefulsetName) - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 2, statefulsetName, 1, - accessMode) - replicas2 = *(statefulset.Spec.Replicas) - stsList = append(stsList, statefulset) - } + framework.Logf("Create %d statefulsets with prefix %s", operationStormScale, prefix2) + for i := 0; i < operationStormScale; i++ { + statefulsetName := prefix2 + strconv.Itoa(i) + framework.Logf("Creating statefulset: %s", statefulsetName) + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, false, true, 2, statefulsetName, 1, + accessMode) + replicas2 = *(statefulset.Spec.Replicas) + stsList = append(stsList, statefulset) + } - defer func() { - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - volumeHandle := pv.Spec.CSI.VolumeHandle - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. - _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + volumeHandle := pv.Spec.CSI.VolumeHandle + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + errMsg := "The object or item referred to could not be found" + if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
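+						// Clarifying note on the discarded result below: the return value of the
+						// CNS wait is deliberately dropped ("_ ="), because after the site failure
+						// the volume may linger in CNS for a while; the deferred teardown only logs
+						// the potentially orphaned volume and moves on to the next claim instead of
+						// failing the cleanup.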
+ _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) + } else { + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } + }() + + for i := 0; i < operationStormScale; i++ { + framework.Logf("Creating pvc") + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + pvclaims = append(pvclaims, pvc) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - for i := 0; i < operationStormScale; i++ { - framework.Logf("Creating pvc") - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") - pvclaims = append(pvclaims, pvc) + ginkgo.By("Wait for PVCs to be in bound state") + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - ginkgo.By("Wait for PVCs to be in bound state") - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the secondary site while scaling statfulsets and creation/deletion of PVCs") + replicas1 += 2 + replicas2 -= 2 + var wg sync.WaitGroup + ch := make(chan *v1.PersistentVolumeClaim) + lock := &sync.Mutex{} + wg.Add(5) + go scaleStsReplicaInParallel(ctx, client, stsList, prefix1, replicas1, &wg) + go scaleStsReplicaInParallel(ctx, client, stsList, prefix2, replicas2, &wg) + go deletePvcInParallel(ctx, client, pvclaims, namespace, &wg) + go createPvcInParallel(ctx, client, namespace, diskSize, sc, ch, lock, &wg, operationStormScale) + go func() { + for v := range ch { + pvcList = append(pvcList, v) + } + }() - ginkgo.By("Bring down the secondary site while scaling statfulsets and creation/deletion of PVCs") - replicas1 += 2 - replicas2 -= 2 - var wg sync.WaitGroup - ch := make(chan *v1.PersistentVolumeClaim) - lock := &sync.Mutex{} - wg.Add(5) - go scaleStsReplicaInParallel(ctx, client, stsList, prefix1, replicas1, &wg) - go scaleStsReplicaInParallel(ctx, client, stsList, prefix2, replicas2, &wg) - go deletePvcInParallel(ctx, client, pvclaims, namespace, &wg) - go createPvcInParallel(ctx, client, namespace, diskSize, sc, ch, lock, &wg, operationStormScale) - go func() { - for v := range ch { - pvcList = append(pvcList, v) - } - }() - framework.Logf("ch is %v", ch) - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() - close(ch) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + close(ch) - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be 
healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("pvcList is %v", pvcList) + framework.Logf("pvcList is %v", pvcList) - // Scale down replicas of statefulset2 and verify CNS entries for volumes + // Scale down replicas of statefulset2 and verify CNS entries for volumes - // Waiting for pods status to be Ready and have scaled properly - framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have "+ - "scaled properly to replica %d", prefix1, replicas2) - for _, statefulset := range stsList { - if strings.Contains(statefulset.Name, prefix1) { - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas1, false, false) + // Waiting for pods status to be Ready and have scaled properly + framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have "+ + "scaled properly to replica %d", prefix1, replicas2) + for _, statefulset := range stsList { + if strings.Contains(statefulset.Name, prefix1) { + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + replicas1, false, false) + } } - } - framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have"+ - "scaled properly to replica %d", prefix2, replicas2) - for _, statefulset := range stsList { - if strings.Contains(statefulset.Name, prefix2) { - fss.WaitForStatusReadyReplicas(ctx, client, statefulset, replicas2) + framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have"+ + "scaled properly to replica %d", prefix2, replicas2) + for _, statefulset := range stsList { + if strings.Contains(statefulset.Name, prefix2) { + fss.WaitForStatusReadyReplicas(ctx, client, statefulset, replicas2) + } } - } - ginkgo.By("Wait for PVCs to be in bound state") - persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvcList, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + ginkgo.By("Wait for PVCs to be in bound state") + persistentvolumes, err = fpv.WaitForPVClaimBoundPhase(ctx, client, pvcList, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - replicas1 -= 1 - replicas2 += 1 + replicas1 -= 1 + replicas2 += 1 - framework.Logf("Scaling statefulset pod replicas with prefix %s to"+ - "%d number of replicas", prefix1, replicas1) - for _, statefulset := range stsList { - if strings.Contains(statefulset.Name, prefix1) { - fss.UpdateReplicas(ctx, client, statefulset, replicas1) + framework.Logf("Scaling statefulset pod replicas with prefix %s to"+ + "%d number of replicas", prefix1, replicas1) + for _, statefulset := range stsList { + if strings.Contains(statefulset.Name, prefix1) { + fss.UpdateReplicas(ctx, client, 
statefulset, replicas1) + } } - } - framework.Logf("Scaling statefulset pod replicas with prefix %s to"+ - "%d number of replicas", prefix1, replicas1) - for _, statefulset := range stsList { - if strings.Contains(statefulset.Name, prefix2) { - fss.UpdateReplicas(ctx, client, statefulset, replicas2) + framework.Logf("Scaling statefulset pod replicas with prefix %s to"+ + "%d number of replicas", prefix1, replicas1) + for _, statefulset := range stsList { + if strings.Contains(statefulset.Name, prefix2) { + fss.UpdateReplicas(ctx, client, statefulset, replicas2) + } } - } - framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have"+ - "scaled properly to replica %d", prefix1, replicas1) - for _, statefulset := range stsList { - if strings.Contains(statefulset.Name, prefix1) { - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - nil, replicas1, false, false) + framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have"+ + "scaled properly to replica %d", prefix1, replicas1) + for _, statefulset := range stsList { + if strings.Contains(statefulset.Name, prefix1) { + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + nil, replicas1, false, false) + } } - } - framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have"+ - "scaled properly to replica %d", prefix2, replicas2) - for _, statefulset := range stsList { - if strings.Contains(statefulset.Name, prefix2) { - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas2, false, false) + framework.Logf("Waiting for statefulset pod status with prefix %s to be Ready and have"+ + "scaled properly to replica %d", prefix2, replicas2) + for _, statefulset := range stsList { + if strings.Contains(statefulset.Name, prefix2) { + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + replicas2, false, false) + } } - } - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - ginkgo.By("Bring up the secondary site") - siteRestore(false) + ginkgo.By("Bring up the secondary site") + siteRestore(false) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) - }) /* Partial failure of secondary site Steps: @@ -3241,118 +3504,118 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 2. Create a statefulset, deployment with volumes from the stretched datastore 3. Bring down a esx server in the secondary site 4. Verify that the VMs on the esx server which was brought down are started up on the - other esx servers in the secondary site + other esx servers in the secondary site. 5. Verify that the k8s cluster is healthy and all the k8s constructs created in step 2 are running and volume and application lifecycle actions work fine 6. Restore secondary site back up and wait for testbed to be back to normal 7. 
Delete all objects created in step 2 and 5 */ - ginkgo.It("[control-plane-on-primary] Partial failure of secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass for Statefulset") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-default" - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + ginkgo.It("Partial failure of secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, controlPlaneOnPrimary), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass for Statefulset") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-default" - ginkgo.By("Creating service") - service := CreateService(namespace, client) - defer func() { - deleteService(namespace, client, service) - }() + scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() - ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") - statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, - false, 3, "", 1, accessMode) + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) - replicas := *(statefulset.Spec.Replicas) - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Creating statefulset and deployment with volumes from the stretched datastore") + statefulset, _, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, 3, "", 1, accessMode) - defer func() { - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + ssPodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset) + replicas := *(statefulset.Spec.Replicas) + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - volumeHandle := pv.Spec.CSI.VolumeHandle - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - errMsg := "The object or item referred to could not be found" - if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. - _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) - } else { - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + volumeHandle := pv.Spec.CSI.VolumeHandle + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + errMsg := "The object or item referred to could not be found" + if err != nil && checkForEventWithMessage(client, "", pv.Name, errMsg) { + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, errMsg) + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. + _ = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, errMsg) + } else { + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } - } - }() + }() - ginkgo.By("Bring down a host in secondary site") - rand.New(rand.NewSource(time.Now().UnixNano())) - max, min := 3, 0 - randomValue := rand.Intn(max-min) + min - host := fds.secondarySiteHosts[randomValue] - hostFailure(ctx, host, true) + ginkgo.By("Bring down a host in secondary site") + rand.New(rand.NewSource(time.Now().UnixNano())) + max, min := 3, 0 + randomValue := rand.Intn(max-min) + min + host := fds.secondarySiteHosts[randomValue] + hostFailure(ctx, host, true) + + defer func() { + ginkgo.By("Bring up host in secondary site") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + hostFailure(ctx, host, false) + fds.hostsDown = nil + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Verifying volume lifecycle actions works fine") + volumeLifecycleActions(ctx, client, namespace, sc, "") + // Scale down replicas of statefulset and verify CNS entries for volumes + scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + ssPodsBeforeScaleDown, replicas-1, true, true) + // Scale up replicas of statefulset and verify CNS entries for volumes + scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, + replicas, true, true) - defer func() { ginkgo.By("Bring up host in secondary site") if 
len(fds.hostsDown) > 0 && fds.hostsDown != nil { hostFailure(ctx, host, false) fds.hostsDown = nil } - }() - - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - ginkgo.By("Verifying volume lifecycle actions works fine") - volumeLifecycleActions(ctx, client, namespace, sc, "") - // Scale down replicas of statefulset and verify CNS entries for volumes - scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - ssPodsBeforeScaleDown, replicas-1, true, true) - // Scale up replicas of statefulset and verify CNS entries for volumes - scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset, - replicas, true, true) - - ginkgo.By("Bring up host in secondary site") - if len(fds.hostsDown) > 0 && fds.hostsDown != nil { - hostFailure(ctx, host, false) - fds.hostsDown = nil - } - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + }) /* PV/PVC with Retain reclaim policy deletion while secondary site goes down @@ -3369,131 +3632,132 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 9. 
Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[distributed] PV/PVC with Retain reclaim policy deletion while secondary site goes down "+ - "and csi provisioner and csi-syncer leaders are in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-retain" - var pvclaims, pvcs []*v1.PersistentVolumeClaim - var pvs []*v1.PersistentVolume - - sc, err := createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.It("PV/PVC with Retain reclaim policy deletion while secondary site goes down "+ + "and csi provisioner and csi-syncer leaders are in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-retain" + var pvclaims, pvcs []*v1.PersistentVolumeClaim + var pvs []*v1.PersistentVolume - framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + sc, err := createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - framework.Logf("Creating pvc %v with reclaim policy Retain", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") - pvclaims = append(pvclaims, pvc) + framework.Logf("Ensuring %s leader is in secondary site", provisionerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, provisionerContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - for _, claim := range pvclaims { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + for i := 0; i < volumeOpsScale; i++ { + framework.Logf("Creating pvc %v with reclaim policy Retain", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, sc, "") + pvclaims = append(pvclaims, pvc) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - 
for _, pv := range persistentvolumes { - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, - framework.PodDeleteTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By(fmt.Sprintf("Deleting FCD: %s", volumeHandle)) - err = e2eVSphere.deleteFCD(ctx, volumeHandle, defaultDatastore.Reference()) + + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + + defer func() { + for _, claim := range pvclaims { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, pv := range persistentvolumes { + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, + framework.PodDeleteTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By(fmt.Sprintf("Deleting FCD: %s", volumeHandle)) + err = e2eVSphere.deleteFCD(ctx, volumeHandle, defaultDatastore.Reference()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + }() + + for i := 0; i < volumeOpsScale/2; i++ { + claim := pvclaims[i] + pv := getPvFromClaim(client, namespace, claim.Name) + pvs = append(pvs, pv) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() + for i := volumeOpsScale / 2; i < volumeOpsScale; i++ { + pvcs = append(pvcs, pvclaims[i]) + } - for i := 0; i < volumeOpsScale/2; i++ { - claim := pvclaims[i] - pv := getPvFromClaim(client, namespace, claim.Name) - pvs = append(pvs, pv) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - for i := volumeOpsScale / 2; i < volumeOpsScale; i++ { - pvcs = append(pvcs, pvclaims[i]) - } + ginkgo.By("Bring down the secondary site while deleting pv") + var wg sync.WaitGroup + wg.Add(3) + go deletePvcInParallel(ctx, client, pvcs, namespace, &wg) + go deletePvInParallel(ctx, client, pvs, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring down the secondary site while deleting pv") - var wg sync.WaitGroup - wg.Add(3) - go 
deletePvcInParallel(ctx, client, pvcs, namespace, &wg) - go deletePvInParallel(ctx, client, pvs, &wg) - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range pvs { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, err) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, err) + } + + ginkgo.By("Bring up the secondary site") if len(fds.hostsDown) > 0 { siteRestore(false) fds.hostsDown = []string{} } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + siteRestore(true) - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range pvs { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - framework.Logf("Persistent Volume %v still not deleted with err %v", pv.Name, err) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - // Orphan volumes may be left over here, hence logging those PVs and ignoring the error for now. 
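+				// Best-effort check (as noted above): the error from the CNS query that follows
+				// is captured only for the log line after it, since orphaned volumes may still be
+				// reported by CNS at this point and the test tolerates that until the site is
+				// restored.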
- err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - framework.Logf("Volume %v still not deleted from CNS with err %v", pv.Name, err) - } - - ginkgo.By("Bring up the secondary site") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = []string{} - } - - siteRestore(true) - - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + }) /* Static PV/PVC creation while secondary site goes down and csi-syncer leader is in secondary site @@ -3512,465 +3776,620 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f */ ginkgo.It("[distributed] Static PV/PVC creation while secondary site goes down"+ - " and csi-syncer leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storageThickPolicyName - storageClassName = "nginx-sc-thick" - var pvclaims []*v1.PersistentVolumeClaim - var fcdIDs []string - - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - sc, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() - - framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) - err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - for i := 0; i < volumeOpsScale; i++ { - ginkgo.By("Creating FCD Disk") - fcdID, err := e2eVSphere.createFCD(ctx, "FCD"+strconv.Itoa(i), diskSizeInMb, defaultDatastore.Reference()) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - fcdIDs = append(fcdIDs, fcdID) - } - - ginkgo.By(fmt.Sprintf("Sleeping for %v seconds to allow newly created FCDs to sync with pandora", - pandoraSyncWaitTime)) - time.Sleep(time.Duration(pandoraSyncWaitTime) * time.Second) - - // Get the list of csi pods running in CSI namespace - csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + " and csi-syncer leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + var pvclaims []*v1.PersistentVolumeClaim + var fcdIDs []string + var svcCsipods *v1.PodList + var svcClient clientset.Interface + var svNamespace string + var svcPVCNames []string + var deleteFCD bool = true + //namespace = "default" + framework.Logf("namespace: %s", namespace) + + if vanillaCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storageThickPolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else { + svcClient, svNamespace = getSvcClientAndNamespace() + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Bring down the secondary site while creating static pv and pvcs") - var wg sync.WaitGroup - ch := make(chan *v1.PersistentVolumeClaim) - wg.Add(2) - go createStaticPvAndPvcInParallel(client, ctx, fcdIDs, ch, namespace, &wg, volumeOpsScale) - go func() { - for v := range ch { - pvclaims = append(pvclaims, v) } - }() - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() - close(ch) - defer func() { - pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, claim := range pvcs.Items { - pv := getPvFromClaim(client, namespace, claim.Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + if vanillaCluster { + framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) + err = changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - - persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < volumeOpsScale; i++ { - volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - framework.Logf("Verifying CNS entry is present in cache for pv: %s", persistentvolumes[i].Name) - _, err = e2eVSphere.queryCNSVolumeWithResult(volHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for i := 0; i < volumeOpsScale; i++ { + ginkgo.By("Creating FCD Disk") + fcdID, err := e2eVSphere.createFCD(ctx, "FCD"+strconv.Itoa(i), diskSizeInMb, defaultDatastore.Reference()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + fcdIDs = append(fcdIDs, fcdID) + } - for _, pvclaim := range pvclaims { - err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumes { - err := fpv.DeletePersistentVolume(ctx, client, pv.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + defer func() { + if deleteFCD { + for _, fcdID := range fcdIDs { + ginkgo.By(fmt.Sprintf("Deleting FCD: %s", fcdID)) - for _, fcdId := range fcdIDs { - ginkgo.By(fmt.Sprintf("Deleting FCD: %s", fcdId)) - err := e2eVSphere.deleteFCD(ctx, fcdId, defaultDatastore.Reference()) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + err := e2eVSphere.deleteFCD(ctx, fcdID, defaultDatastore.Reference()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } + }() - ginkgo.By("Bring up the secondary site") - siteRestore(false) + ginkgo.By(fmt.Sprintf("Sleeping for %v seconds to allow newly created FCDs to sync with pandora", + pandoraSyncWaitTime)) + time.Sleep(time.Duration(300) * time.Second) - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + if guestCluster { + var restConfig *restclient.Config + if k8senv := GetAndExpectStringEnvVar("SUPERVISOR_CLUSTER_KUBE_CONFIG"); k8senv != "" { + restConfig, err = clientcmd.BuildConfigFromFlags("", k8senv) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, fcdID := range fcdIDs { + ginkgo.By("Create CNS register volume with above created FCD") + rand.New(rand.NewSource(time.Now().UnixNano())) + suffix := fmt.Sprintf("-%v-%v", time.Now().UnixNano(), rand.Intn(10000)) + + svpvcName := "cns-pvc-" + suffix + framework.Logf("pvc name :%s", svpvcName) + cnsRegisterVolume := getCNSRegisterVolumeSpec(ctx, svNamespace, fcdID, "", svpvcName, v1.ReadWriteOnce) + err = createCNSRegisterVolume(ctx, restConfig, cnsRegisterVolume) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.ExpectNoError(waitForCNSRegisterVolumeToGetCreated(ctx, + restConfig, namespace, cnsRegisterVolume, poll, supervisorClusterOperationsTimeout*2)) + cnsRegisterVolumeName := cnsRegisterVolume.GetName() + framework.Logf("CNS register volume name : %s", cnsRegisterVolumeName) - /* - Site failover during full sync - Steps: - 1. Configure a vanilla multi-master K8s cluster with inter and intra site replication - 2. Create 6 PVCs with reclaim policy Delete and 8 with reclaim policy Retain - and wait for them to be bound - 3. Delete four PVCs with reclaim policy Retain - 4. Delete two PVs reclaim policy Retain related to PVC used in step 3 - 5. Create two pods using PVCs with reclaim policy Delete - 6. Bring vsan-health service down - 7. Create two pods with two PVCs each - 8. Create two static PVs with disk left after step 4 - 9. Create two PVCs to bind to PVs with reclaim policy Retain - 10. Delete four PVCs with reclaim policy Retain different from the ones used in step 3 and 9 - 11. Delete two PVs reclaim policy Retain related to PVC used in step 10 - 12. Add labels to all PVs, PVCs - 13. Bring vsan-health service up when full sync is triggered - 14. Bring down primary site - 15. Verify that the VMs on the primary site are started up on the other esx servers - in the secondary site - 16. Wait for full sync - 17. Verify CNS entries - 18. Delete all pods, PVCs and PVs - 19. 
Bring primary site up and wait for testbed to be back to normal + ginkgo.By("verify created PV, PVC and check the bidirectional reference") + svcPVC, err := svcClient.CoreV1().PersistentVolumeClaims(svNamespace).Get(ctx, svpvcName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + svcPV := getPvFromClaim(svcClient, svNamespace, svpvcName) + verifyBidirectionalReferenceOfPVandPVC(ctx, svcClient, svcPVC, svcPV, fcdID) + svcPVCNames = append(svcPVCNames, svpvcName) + + defer func() { + for _, svpvcName := range svcPVCNames { + svcPV := getPvFromClaim(svcClient, namespace, svpvcName) + err := fpv.DeletePersistentVolumeClaim(ctx, svcClient, svpvcName, svNamespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.DeletePersistentVolume(ctx, svcClient, svcPV.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.WaitForPersistentVolumeDeleted(ctx, svcClient, svcPV.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := svcPV.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + } + } - */ - ginkgo.It("[primary-centric][distributed] Primary site failover during full sync when syncer"+ - " pod leader is in primary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-delete" - var pods []*v1.Pod - var pvclaimsWithDelete, pvclaimsWithRetain []*v1.PersistentVolumeClaim - var volHandles []string - - framework.Logf("Ensuring %s leader is in primary site", syncerContainerName) - err := changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, true) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + // Get the list of csi pods running in CSI namespace + csipods, err := client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Bring down the secondary site while creating static pv and pvcs") + var wg sync.WaitGroup + ch := make(chan *v1.PersistentVolumeClaim) + lock := &sync.Mutex{} + wg.Add(2) + if vanillaCluster { + go createStaticPvAndPvcInParallel(client, ctx, fcdIDs, ch, namespace, &wg, volumeOpsScale) + } else if guestCluster { + go createStaticPvAndPvcInGuestClusterInParallel(client, ctx, namespace, + svcPVCNames, sc.Name, ch, lock, &wg) + } + go func() { + for v := range ch { + pvclaims = append(pvclaims, v) + } + }() + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + close(ch) - scRetain, err := createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volumeHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volumeHandle).NotTo(gomega.BeEmpty()) + } + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - scDelete, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, scRetain.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = client.StorageV1().StorageClasses().Delete(ctx, scDelete.Name, *metav1.NewDeleteOptions(0)) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if guestCluster || vanillaCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - for i := 0; i < 6; i++ { - framework.Logf("Creating pvc %v with reclaim policy Delete", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scDelete, "") - pvclaimsWithDelete = append(pvclaimsWithDelete, pvc) + time.Sleep(5 * time.Minute) + if guestCluster { + ginkgo.By("Check if csi pods are running fine after site failure in supervisor") + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - for i := 0; i < 8; i++ { - framework.Logf("Creating pvc %v with reclaim policy Retain", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scRetain, "") - pvclaimsWithRetain = append(pvclaimsWithRetain, pvc) + persistentvolumes, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaims, framework.ClaimProvisionTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - - persistentvolumesRetain, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithRetain, - framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < 8; i++ { - volHandle := persistentvolumesRetain[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - - persistentvolumesDelete, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithDelete, - framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < 6; i++ { - volHandle := persistentvolumesDelete[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } - - defer func() { - for _, claim := range pvclaimsWithDelete { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + for i := 0; i < volumeOpsScale; i++ { + volHandle := persistentvolumes[i].Spec.CSI.VolumeHandle + 
gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + framework.Logf("Verifying CNS entry is present in cache for pv: %s", persistentvolumes[i].Name) + _, err = e2eVSphere.queryCNSVolumeWithResult(volHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - for _, claim := range pvclaimsWithRetain { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + + for _, pvclaim := range pvclaims { + err = fpv.DeletePersistentVolumeClaim(ctx, client, pvclaim.Name, namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumesDelete { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, - framework.PodDeleteTimeout) + for _, pv := range persistentvolumes { + err := fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) volumeHandle := pv.Spec.CSI.VolumeHandle err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - for _, pv := range persistentvolumesRetain { - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + + for _, svpvcName := range svcPVCNames { + svcPV := getPvFromClaim(svcClient, svNamespace, svpvcName) + err := fpv.DeletePersistentVolumeClaim(ctx, svcClient, svpvcName, svNamespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, - framework.PodDeleteTimeout) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.DeletePersistentVolume(ctx, svcClient, svcPV.Name) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle + err = fpv.WaitForPersistentVolumeDeleted(ctx, svcClient, svcPV.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := svcPV.Spec.CSI.VolumeHandle err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - for i := 0; i < 4; i++ { - if i == 0 || i == 1 { - pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("Deleting pv %s from pvc: %s", pv.Name, pvclaimsWithRetain[i].Name) - volHandle := pv.Spec.CSI.VolumeHandle - volHandles = append(volHandles, volHandle) - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } else { - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + for _, fcdId := range fcdIDs { + ginkgo.By(fmt.Sprintf("Deleting FCD: %s", fcdId)) + err := e2eVSphere.deleteFCD(ctx, fcdId, defaultDatastore.Reference()) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - } - - for i := 0; i < 2; i++ { - pod, err := createPod(ctx, client, namespace, - nil, []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, - false, execCommand) - framework.Logf("Created pod %s", pod.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pods = append(pods, pod) - } - framework.Logf("Stopping vsan-health on the vCenter 
host") - vcAddress := e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort - err = invokeVCenterServiceControl(ctx, stopOperation, vsanhealthServiceName, vcAddress) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = waitVCenterServiceToBeInState(ctx, vsanhealthServiceName, vcAddress, svcStoppedMessage) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - isVsanHealthServiceStopped = true - - for i := 2; i < 4; i++ { - pod, err := createPod(ctx, client, namespace, - nil, []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, - false, execCommand) - framework.Logf("Created pod %s", pod.Name) - gomega.Expect(err).To(gomega.HaveOccurred()) - pods = append(pods, pod) - } - // Creating label for PV. - // PVC will use this label as Selector to find PV - staticPVLabels := make(map[string]string) - var staticPvcs []*v1.PersistentVolumeClaim - var staticPvs []*v1.PersistentVolume - for i := 0; i < 2; i++ { - staticPVLabels["fcd-id"] = volHandles[i] + deleteFCD = false + ginkgo.By("Bring up the secondary site") + siteRestore(false) - ginkgo.By("Creating static PV") - pv := getPersistentVolumeSpec(volHandles[i], v1.PersistentVolumeReclaimDelete, staticPVLabels, ext4FSType) - pv, err = client.CoreV1().PersistentVolumes().Create(ctx, pv, metav1.CreateOptions{}) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - staticPvs = append(staticPvs, pv) + }) - ginkgo.By("Creating PVC from static PV") - pvc := getPersistentVolumeClaimSpec(namespace, staticPVLabels, pv.Name) - pvc, err = client.CoreV1().PersistentVolumeClaims(namespace).Create(ctx, pvc, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - staticPvcs = append(staticPvcs, pvc) + /* + Site failover during full sync + Steps: + 1. Configure a vsan stretched cluster testbed. + 2. Create 6 PVCs with reclaim policy Delete and 8 with reclaim policy Retain + and wait for them to be bound + 3. Delete four PVCs with reclaim policy Retain + 4. Delete two PVs reclaim policy Retain related to PVC used in step 3 + 5. Create two pods using PVCs with reclaim policy Delete + 6. Bring vsan-health service down + 7. Create two pods with two PVCs each + 8. Create two static PVs with disk left after step 4 + 9. Create two PVCs to bind to PVs with reclaim policy Retain + 10. Delete four PVCs with reclaim policy Retain different from the ones used in step 3 and 9 + 11. Delete two PVs reclaim policy Retain related to PVC used in step 10 + 12. Add labels to all PVs, PVCs + 13. Bring vsan-health service up when full sync is triggered + 14. Bring down primary site + 15. Verify that the VMs on the primary site are started up on the other esx servers + in the secondary site + 16. Wait for full sync + 17. Verify CNS entries + 18. Delete all pods, PVCs and PVs + 19. 
Bring primary site up and wait for testbed to be back to normal - framework.ExpectNoError(fpv.WaitOnPVandPVC(ctx, client, framework.NewTimeoutContext(), namespace, pv, pvc)) + */ + ginkgo.It("Primary site failover during full sync", + ginkgo.Label(p0, vsanStretch, block, vanilla, wcp, tkg, primaryCentric, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - } + var pods []*v1.Pod + var pvclaimsWithDelete, pvclaimsWithRetain []*v1.PersistentVolumeClaim + var volHandles []string + var scRetain, scDelete *storagev1.StorageClass + var cnsOperatorClient clientgrp.Client - defer func() { - ginkgo.By("Deleting static pvcs and pvs") - for _, pvc := range staticPvcs { - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + if vanillaCluster { + framework.Logf("Ensuring %s leader is in primary site", syncerContainerName) + err := changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, true) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - for _, pv := range staticPvs { - err := fpv.DeletePersistentVolume(ctx, client, pv.Name) + + if vanillaCluster { + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scRetain, err = createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, - pollTimeout) + scSpec := getVSphereStorageClassSpec("nginx-sc-delete", scParameters, nil, "", "", false) + scDelete, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, scRetain.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = client.StorageV1().StorageClasses().Delete(ctx, scDelete.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + } else if guestCluster { + scParameters = map[string]string{} + scParameters[svStorageClassName] = storagePolicyName + scRetain, err = createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + scSpec := getVSphereStorageClassSpec("nginx-sc-delete", scParameters, nil, "", "", false) + scDelete, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - }() - - for i := 4; i < 8; i++ { - if i == 4 || i == 5 { - pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + for i := 0; i < 6; i++ { + framework.Logf("Creating pvc %v with reclaim policy Delete", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scDelete, "") + pvclaimsWithDelete = append(pvclaimsWithDelete, pvc) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + } + + for i := 0; i < 8; i++ { + framework.Logf("Creating pvc %v with reclaim 
policy Retain", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scRetain, "") + pvclaimsWithRetain = append(pvclaimsWithRetain, pvc) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - } else { - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + persistentvolumesRetain, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithRetain, + framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < 8; i++ { + volHandle := persistentvolumesRetain[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + + persistentvolumesDelete, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithDelete, + framework.ClaimProvisionTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for i := 0; i < 6; i++ { + volHandle := persistentvolumesDelete[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + + defer func() { + for _, claim := range pvclaimsWithDelete { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, claim := range pvclaimsWithRetain { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumesDelete { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, + framework.PodDeleteTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, pv := range persistentvolumesRetain { + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, + framework.PodDeleteTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() + + for i := 0; i < 4; i++ { + if i == 0 || i == 1 { + pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Deleting pv %s from pvc: %s", pv.Name, pvclaimsWithRetain[i].Name) + volHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(pv.Spec.CSI.VolumeHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + volHandles = append(volHandles, volHandle) + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } + + for i := 0; i < 2; i++ { + pod, err := createPod(ctx, client, namespace, + nil, []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, + false, execCommand) + framework.Logf("Created pod 
%s", pod.Name) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pods = append(pods, pod) } - } + framework.Logf("Stopping vsan-health on the vCenter host") + vcAddress := e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort + err = invokeVCenterServiceControl(ctx, stopOperation, vsanhealthServiceName, vcAddress) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = waitVCenterServiceToBeInState(ctx, vsanhealthServiceName, vcAddress, svcStoppedMessage) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + isVsanHealthServiceStopped = true - framework.Logf("Sleeping full-sync interval for pvcs to be " + - "fully deleted") - time.Sleep(time.Duration(60) * time.Second) + for i := 2; i < 4; i++ { + pod, err := createPod(ctx, client, namespace, + nil, []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, + false, execCommand) + framework.Logf("Created pod %s", pod.Name) + gomega.Expect(err).To(gomega.HaveOccurred()) + pods = append(pods, pod) + } - labels := make(map[string]string) - labels[labelKey] = labelValue + // Creating label for PV. + // PVC will use this label as Selector to find PV + staticPVLabels := make(map[string]string) + var staticPvcs []*v1.PersistentVolumeClaim + var staticPvs []*v1.PersistentVolume + for i := 0; i < 2; i++ { + staticPVLabels["fcd-id"] = volHandles[i] + + ginkgo.By("Creating static PV") + pv := getPersistentVolumeSpec(volHandles[i], v1.PersistentVolumeReclaimDelete, staticPVLabels, ext4FSType) + pv, err = client.CoreV1().PersistentVolumes().Create(ctx, pv, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + staticPvs = append(staticPvs, pv) - allPvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, pvc := range allPvcs.Items { - framework.Logf("Updating labels %+v for pvc %s in namespace %s", - labels, pvc.Name, namespace) - pvc, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvc.Name, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvc.Labels = labels - _, err = client.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - "Error on updating pvc labels is: %v", err) - } + ginkgo.By("Creating PVC from static PV") + pvc := getPersistentVolumeClaimSpec(namespace, staticPVLabels, pv.Name) + pvc, err = client.CoreV1().PersistentVolumeClaims(namespace).Create(ctx, pvc, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + staticPvcs = append(staticPvcs, pvc) - allPvs, err := client.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, pv := range allPvs.Items { - framework.Logf("Updating labels %+v for pv %s in namespace %s", - labels, pv.Name, namespace) - pv, err := client.CoreV1().PersistentVolumes().Get(ctx, pv.Name, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pv.Labels = labels - _, err = client.CoreV1().PersistentVolumes().Update(ctx, pv, metav1.UpdateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - "Error on updating pv labels is: %v", err) - } + framework.ExpectNoError(fpv.WaitOnPVandPVC(ctx, client, framework.NewTimeoutContext(), namespace, pv, pvc)) - framework.Logf("Starting vsan-health on the vCenter host") - vcAddress = e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort - startVCServiceWait4VPs(ctx, vcAddress, vsanhealthServiceName, &isVsanHealthServiceStopped) + } - 
framework.Logf("Sleeping full-sync interval for vsan health service " + - "to be fully up") - time.Sleep(time.Duration(300) * time.Second) + defer func() { + ginkgo.By("Deleting static pvcs and pvs") + for _, pvc := range staticPvcs { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, pv := range staticPvs { + err := fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - csipods, err := client.CoreV1().Pods(csiSystemNamespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Get restConfig. - restConfig := getRestConfigClient() - cnsOperatorClient, err := k8s.NewClientForGroup(ctx, restConfig, cnsoperatorv1alpha1.GroupName) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - enableFullSyncTriggerFss(ctx, client, csiSystemNamespace, fullSyncFss) - ginkgo.By("Bring down the primary site while full sync is going on") - var wg sync.WaitGroup - wg.Add(2) - go triggerFullSyncInParallel(ctx, cnsOperatorClient, &wg) - go siteFailureInParallel(ctx, true, &wg) - wg.Wait() + }() - defer func() { - ginkgo.By("Bring up the primary site before terminating the test") - if len(fds.hostsDown) > 0 { - siteRestore(true) - fds.hostsDown = []string{} + for i := 4; i < 8; i++ { + if i == 4 || i == 5 { + pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + } else { + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Sleeping full-sync interval for pvcs to be " + + "fully deleted") + time.Sleep(time.Duration(60) * time.Second) - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + labels := make(map[string]string) + labels[labelKey] = labelValue - ginkgo.By("Trigger 2 full syncs as full sync might be interrupted during site failover") - triggerFullSync(ctx, cnsOperatorClient) + allPvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, pvc := range allPvcs.Items { + framework.Logf("Updating labels %+v for pvc %s in namespace %s", + labels, pvc.Name, namespace) + pvc, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvc.Name, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvc.Labels = labels + _, err = 
client.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + "Error on updating pvc labels is: %v", err) + } - ginkgo.By("Checking whether pods are in Running state") - for _, pod := range pods { - framework.Logf("Pod is %s", pod.Name) - err = fpod.WaitForPodNameRunningInNamespace(ctx, client, pod.Name, namespace) + allPvs, err := client.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for _, pv := range allPvs.Items { + framework.Logf("Updating labels %+v for pv %s in namespace %s", + labels, pv.Name, namespace) + pv, err := client.CoreV1().PersistentVolumes().Get(ctx, pv.Name, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pv.Labels = labels + _, err = client.CoreV1().PersistentVolumes().Update(ctx, pv, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + "Error on updating pv labels is: %v", err) + } - for _, pvc := range allPvcs.Items { - ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pvc %s in namespace %s", - labels, pvc.Name, namespace)) - pv := getPvFromClaim(client, namespace, pvc.Name) - err = e2eVSphere.verifyLabelsAreUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) + framework.Logf("Starting vsan-health on the vCenter host") + vcAddress = e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort + startVCServiceWait4VPs(ctx, vcAddress, vsanhealthServiceName, &isVsanHealthServiceStopped) + + framework.Logf("Sleeping full-sync interval for vsan health service " + + "to be fully up") + time.Sleep(time.Duration(300) * time.Second) + + csipods, err := client.CoreV1().Pods(csiSystemNamespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + if vanillaCluster { + // Get restConfig. 
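+				// On vanilla k8s the test builds a CNS operator client so that full sync can be
+				// triggered explicitly while the primary site is failed over in parallel; the
+				// non-vanilla (WCP/TKG) branch below instead waits out the periodic full-sync
+				// interval and then performs the site failover directly.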
+ restConfig := getRestConfigClient() + cnsOperatorClient, err = k8s.NewClientForGroup(ctx, restConfig, cnsoperatorv1alpha1.GroupName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + enableFullSyncTriggerFss(ctx, client, csiSystemNamespace, fullSyncFss) + ginkgo.By("Bring down the primary site while full sync is going on") + var wg sync.WaitGroup + + wg.Add(2) + go triggerFullSyncInParallel(ctx, cnsOperatorClient, &wg) + go siteFailureInParallel(ctx, true, &wg) + wg.Wait() + } else { + framework.Logf("Sleeping full-sync interval time") + time.Sleep(time.Duration(120) * time.Second) + siteFailover(ctx, true) + } + + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(true) + fds.hostsDown = []string{} + } + }() + + ginkgo.By("Wait for k8s cluster to be healthy") + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + if vanillaCluster { + ginkgo.By("Trigger 2 full syncs as full sync might be interrupted during site failover") + triggerFullSync(ctx, cnsOperatorClient) + } else { + framework.Logf("Sleeping for %d seconds for full sync time interval", fullSyncWaitTime) + time.Sleep(time.Duration(fullSyncWaitTime) * time.Second) + } + + ginkgo.By("Checking whether pods are in Running state") + for _, pod := range pods { + framework.Logf("Pod is %s", pod.Name) + err = fpod.WaitForPodNameRunningInNamespace(ctx, client, pod.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + for _, pvc := range allPvcs.Items { + ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pvc %s in namespace %s", + labels, pvc.Name, namespace)) + pv := getPvFromClaim(client, namespace, pvc.Name) + volHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } + + err = e2eVSphere.waitForLabelsToBeUpdated(volHandle, labels, + string(cnstypes.CnsKubernetesEntityTypePVC), pvc.Name, pvc.Namespace) + } + + for _, pv := range allPvs.Items { + ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pv %s", + labels, pv.Name)) + volHandle := pv.Spec.CSI.VolumeHandle + if guestCluster { + volHandle = getVolumeIDFromSupervisorCluster(volHandle) + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - for _, pv := range allPvs.Items { - ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pv %s", - labels, pv.Name)) - err = e2eVSphere.verifyLabelsAreUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + err = e2eVSphere.waitForLabelsToBeUpdated(volHandle, labels, + string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) + } - // Deleting all the pods in the namespace - for _, pod := range pods { - ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace)) - err = fpod.DeletePodWithWait(ctx, client, pod) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + // Deleting all the pods in the namespace + for _, pod := range pods { + ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, 
namespace)) + err = fpod.DeletePodWithWait(ctx, client, pod) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Bring up the primary site") - if len(fds.hostsDown) > 0 { - siteRestore(true) - fds.hostsDown = []string{} - } + ginkgo.By("Bring up the primary site") + if len(fds.hostsDown) > 0 { + siteRestore(true) + fds.hostsDown = []string{} + } - ginkgo.By("Wait for k8s cluster to be healthy") - // wait for the VMs to move back - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Wait for k8s cluster to be healthy") + // wait for the VMs to move back + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }) + }) /* Secondary site failover during full sync when syncer pod leader is in secondary site @@ -3999,318 +4418,544 @@ var _ = ginkgo.Describe("[vsan-stretch-vanilla] vsan stretched cluster tests", f 19. Bring secondary site up and wait for testbed to be back to normal */ - ginkgo.It("[distributed] Secondary site failover during full sync when syncer"+ - " pod leader is in secondary site", func() { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ginkgo.By("Creating StorageClass") - // decide which test setup is available to run - ginkgo.By("CNS_TEST: Running for vanilla k8s setup") - scParameters = map[string]string{} - scParameters["StoragePolicyName"] = storagePolicyName - storageClassName = "nginx-sc-delete" - var pods []*v1.Pod - var pvclaimsWithDelete, pvclaimsWithRetain []*v1.PersistentVolumeClaim - var volHandles []string - - framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) - err := changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.It("Secondary site failover during full sync when syncer"+ + " pod leader is in secondary site", + ginkgo.Label(p0, vsanStretch, block, vanilla, distributed), func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ginkgo.By("Creating StorageClass") + // decide which test setup is available to run + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + storageClassName = "nginx-sc-delete" + var pods []*v1.Pod + var pvclaimsWithDelete, pvclaimsWithRetain []*v1.PersistentVolumeClaim + var volHandles []string - scRetain, err := createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Ensuring %s leader is in secondary site", syncerContainerName) + err := changeLeaderOfContainerToComeUpOnMaster(ctx, client, sshClientConfig, syncerContainerName, false) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - scSpec := getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) - scDelete, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - err := client.StorageV1().StorageClasses().Delete(ctx, scRetain.Name, *metav1.NewDeleteOptions(0)) + scRetain, err := createStorageClass(client, scParameters, nil, v1.PersistentVolumeReclaimRetain, "", false, "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = client.StorageV1().StorageClasses().Delete(ctx, scDelete.Name, *metav1.NewDeleteOptions(0)) + + scSpec := 
getVSphereStorageClassSpec(storageClassName, scParameters, nil, "", "", false) + scDelete, err := client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - }() + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, scRetain.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = client.StorageV1().StorageClasses().Delete(ctx, scDelete.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() + + for i := 0; i < 6; i++ { + framework.Logf("Creating pvc %v with reclaim policy Delete", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scDelete, "") + pvclaimsWithDelete = append(pvclaimsWithDelete, pvc) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - for i := 0; i < 6; i++ { - framework.Logf("Creating pvc %v with reclaim policy Delete", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scDelete, "") - pvclaimsWithDelete = append(pvclaimsWithDelete, pvc) + for i := 0; i < 8; i++ { + framework.Logf("Creating pvc %v with reclaim policy Retain", i) + pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scRetain, "") + pvclaimsWithRetain = append(pvclaimsWithRetain, pvc) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + persistentvolumesRetain, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithRetain, + framework.ClaimProvisionTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for i := 0; i < 8; i++ { + volHandle := persistentvolumesRetain[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - for i := 0; i < 8; i++ { - framework.Logf("Creating pvc %v with reclaim policy Retain", i) - pvc, err := createPVC(ctx, client, namespace, nil, diskSize, scRetain, "") - pvclaimsWithRetain = append(pvclaimsWithRetain, pvc) + persistentvolumesDelete, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithDelete, + framework.ClaimProvisionTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + for i := 0; i < 6; i++ { + volHandle := persistentvolumesDelete[i].Spec.CSI.VolumeHandle + gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) + } - persistentvolumesRetain, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithRetain, - framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < 8; i++ { - volHandle := persistentvolumesRetain[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + defer func() { + for _, claim := range pvclaimsWithDelete { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, claim := range pvclaimsWithRetain { + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + ginkgo.By("Verify PVs, volumes are deleted from CNS") + for _, pv := range persistentvolumesDelete { + err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, + framework.PodDeleteTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, pv := range persistentvolumesRetain { + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err := fpv.WaitForPersistentVolumeDeleted(ctx, 
client, pv.Name, framework.Poll, + framework.PodDeleteTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + }() - persistentvolumesDelete, err := fpv.WaitForPVClaimBoundPhase(ctx, client, pvclaimsWithDelete, - framework.ClaimProvisionTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for i := 0; i < 6; i++ { - volHandle := persistentvolumesDelete[i].Spec.CSI.VolumeHandle - gomega.Expect(volHandle).NotTo(gomega.BeEmpty()) - } + for i := 0; i < 4; i++ { + if i == 0 || i == 1 { + pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Deleting pv %s from pvc: %s", pv.Name, pvclaimsWithRetain[i].Name) + volHandle := pv.Spec.CSI.VolumeHandle + volHandles = append(volHandles, volHandle) + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + } - defer func() { - for _, claim := range pvclaimsWithDelete { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + for i := 0; i < 2; i++ { + pod, err := createPod(ctx, client, + namespace, nil, + []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, + false, execCommand) + framework.Logf("Created pod %s", pod.Name) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pods = append(pods, pod) } - for _, claim := range pvclaimsWithRetain { - err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.Logf("Stopping vsan-health on the vCenter host") + vcAddress := e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort + err = invokeVCenterServiceControl(ctx, stopOperation, vsanhealthServiceName, vcAddress) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = waitVCenterServiceToBeInState(ctx, vsanhealthServiceName, vcAddress, svcStoppedMessage) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + isVsanHealthServiceStopped = true + + for i := 2; i < 4; i++ { + pod, err := createPod(ctx, client, namespace, + nil, []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, + false, execCommand) + framework.Logf("Created pod %s", pod.Name) + pods = append(pods, pod) + gomega.Expect(err).To(gomega.HaveOccurred()) } - ginkgo.By("Verify PVs, volumes are deleted from CNS") - for _, pv := range persistentvolumesDelete { - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, - framework.PodDeleteTimeout) + + // Creating label for PV. 
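+				// The label value is the fcd-id of a volume whose retained PV was deleted earlier.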
+ // PVC will use this label as Selector to find PV + staticPVLabels := make(map[string]string) + var staticPvcs []*v1.PersistentVolumeClaim + var staticPvs []*v1.PersistentVolume + + for i := 0; i < 2; i++ { + staticPVLabels["fcd-id"] = volHandles[i] + + ginkgo.By("Creating static PV") + pv := getPersistentVolumeSpec(volHandles[i], v1.PersistentVolumeReclaimDelete, staticPVLabels, ext4FSType) + pv, err = client.CoreV1().PersistentVolumes().Create(ctx, pv, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + staticPvs = append(staticPvs, pv) + + ginkgo.By("Creating PVC from static PV") + pvc := getPersistentVolumeClaimSpec(namespace, staticPVLabels, pv.Name) + pvc, err = client.CoreV1().PersistentVolumeClaims(namespace).Create(ctx, pvc, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + staticPvcs = append(staticPvcs, pvc) + + framework.ExpectNoError(fpv.WaitOnPVandPVC(ctx, client, framework.NewTimeoutContext(), namespace, pv, pvc)) + } - for _, pv := range persistentvolumesRetain { - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err := fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, framework.Poll, - framework.PodDeleteTimeout) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - volumeHandle := pv.Spec.CSI.VolumeHandle - err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + ginkgo.By("Deleting static pvcs and pvs") + for _, pvc := range staticPvcs { + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + for _, pv := range staticPvs { + err := fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + }() + + for i := 4; i < 8; i++ { + if i == 4 || i == 5 { + pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = fpv.DeletePersistentVolume(ctx, client, pv.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } else { + framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } } - }() - for i := 0; i < 4; i++ { - if i == 0 || i == 1 { - pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name) - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("Deleting pv %s from pvc: %s", pv.Name, pvclaimsWithRetain[i].Name) - volHandle := pv.Spec.CSI.VolumeHandle - volHandles = append(volHandles, volHandle) - err = fpv.DeletePersistentVolume(ctx, client, pv.Name) - 
gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } else { - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + framework.Logf("Sleeping full-sync interval for pvcs to be " + + "fully deleted") + time.Sleep(time.Duration(60) * time.Second) + + labels := make(map[string]string) + labels[labelKey] = labelValue + + allPvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, pvc := range allPvcs.Items { + framework.Logf("Updating labels %+v for pvc %s in namespace %s", + labels, pvc.Name, namespace) + pvc, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvc.Name, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pvc.Labels = labels + _, err = client.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + "Error on updating pvc labels is: %v", err) } - } - for i := 0; i < 2; i++ { - pod, err := createPod(ctx, client, - namespace, nil, - []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, - false, execCommand) - framework.Logf("Created pod %s", pod.Name) + allPvs, err := client.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pods = append(pods, pod) - } - framework.Logf("Stopping vsan-health on the vCenter host") - vcAddress := e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort - err = invokeVCenterServiceControl(ctx, stopOperation, vsanhealthServiceName, vcAddress) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = waitVCenterServiceToBeInState(ctx, vsanhealthServiceName, vcAddress, svcStoppedMessage) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - isVsanHealthServiceStopped = true - - for i := 2; i < 4; i++ { - pod, err := createPod(ctx, client, namespace, - nil, []*v1.PersistentVolumeClaim{pvclaimsWithDelete[i]}, - false, execCommand) - framework.Logf("Created pod %s", pod.Name) - pods = append(pods, pod) - gomega.Expect(err).To(gomega.HaveOccurred()) - } + for _, pv := range allPvs.Items { + framework.Logf("Updating labels %+v for pv %s in namespace %s", + labels, pv.Name, namespace) + pv, err := client.CoreV1().PersistentVolumes().Get(ctx, pv.Name, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + pv.Labels = labels + _, err = client.CoreV1().PersistentVolumes().Update(ctx, pv, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + "Error on updating pv labels is: %v", err) + } - // Creating label for PV. 
- // PVC will use this label as Selector to find PV - staticPVLabels := make(map[string]string) - var staticPvcs []*v1.PersistentVolumeClaim - var staticPvs []*v1.PersistentVolume + framework.Logf("Starting vsan-health on the vCenter host") + vcAddress = e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort + startVCServiceWait4VPs(ctx, vcAddress, vsanhealthServiceName, &isVsanHealthServiceStopped) - for i := 0; i < 2; i++ { - staticPVLabels["fcd-id"] = volHandles[i] + framework.Logf("Sleeping full-sync interval for vsan health service " + + "to be fully up") + time.Sleep(time.Duration(300) * time.Second) - ginkgo.By("Creating static PV") - pv := getPersistentVolumeSpec(volHandles[i], v1.PersistentVolumeReclaimDelete, staticPVLabels, ext4FSType) - pv, err = client.CoreV1().PersistentVolumes().Create(ctx, pv, metav1.CreateOptions{}) + csipods, err := client.CoreV1().Pods(csiSystemNamespace).List(ctx, metav1.ListOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - staticPvs = append(staticPvs, pv) + // Get restConfig. + restConfig := getRestConfigClient() + cnsOperatorClient, err := k8s.NewClientForGroup(ctx, restConfig, cnsoperatorv1alpha1.GroupName) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + enableFullSyncTriggerFss(ctx, client, csiSystemNamespace, fullSyncFss) + ginkgo.By("Bring down the secondary site while full sync is going on") + var wg sync.WaitGroup + wg.Add(2) + go triggerFullSyncInParallel(ctx, cnsOperatorClient, &wg) + go siteFailureInParallel(ctx, false, &wg) + wg.Wait() + + defer func() { + ginkgo.By("Bring up the secondary site before terminating the test") + if len(fds.hostsDown) > 0 { + siteRestore(false) + fds.hostsDown = nil + } + }() - ginkgo.By("Creating PVC from static PV") - pvc := getPersistentVolumeClaimSpec(namespace, staticPVLabels, pv.Name) - pvc, err = client.CoreV1().PersistentVolumeClaims(namespace).Create(ctx, pvc, metav1.CreateOptions{}) + ginkgo.By("Wait for k8s cluster to be healthy") + wait4AllK8sNodesToBeUp(nodeList) + err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - staticPvcs = append(staticPvcs, pvc) - framework.ExpectNoError(fpv.WaitOnPVandPVC(ctx, client, framework.NewTimeoutContext(), namespace, pv, pvc)) + // Check if csi pods are running fine after site failure + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } + ginkgo.By("Trigger 2 full syncs as full sync might be interrupted during site failover") + triggerFullSync(ctx, cnsOperatorClient) - defer func() { - ginkgo.By("Deleting static pvcs and pvs") - for _, pvc := range staticPvcs { - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvc.Name, namespace) + ginkgo.By("Checking whether pods are in Running state") + for _, pod := range pods { + framework.Logf("Pod is %s", pod.Name) + err = waitForPodsToBeInErrorOrRunning(ctx, client, pod.Name, namespace, pollTimeout*4) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - for _, pv := range staticPvs { - err := fpv.DeletePersistentVolume(ctx, client, pv.Name) + + for _, pvc := range allPvcs.Items { + ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pvc %s in namespace %s", + labels, pvc.Name, namespace)) + pv := getPvFromClaim(client, namespace, pvc.Name) + err = e2eVSphere.verifyLabelsAreUpdated(pv.Spec.CSI.VolumeHandle, labels, + string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - err = 
fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll,
-				pollTimeout)
+
+		}
+
+		for _, pv := range allPvs.Items {
+			ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pv %s",
+				labels, pv.Name))
+			err = e2eVSphere.verifyLabelsAreUpdated(pv.Spec.CSI.VolumeHandle, labels,
+				string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace)
 			gomega.Expect(err).NotTo(gomega.HaveOccurred())
-			volumeHandle := pv.Spec.CSI.VolumeHandle
-			err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle)
+
+		}
+
+		// Deleting all pods in namespace
+		for _, pod := range pods {
+			ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace))
+			err = fpod.DeletePodWithWait(ctx, client, pod)
 			gomega.Expect(err).NotTo(gomega.HaveOccurred())
 		}
-	}()
+		ginkgo.By("Bring up the secondary site")
+		if len(fds.hostsDown) > 0 {
+			siteRestore(false)
+			fds.hostsDown = []string{}
+		}

-	for i := 4; i < 8; i++ {
-		if i == 4 || i == 5 {
-			pv := getPvFromClaim(client, namespace, pvclaimsWithRetain[i].Name)
-			framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name)
-			err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace)
-			gomega.Expect(err).NotTo(gomega.HaveOccurred())
-			err = fpv.DeletePersistentVolume(ctx, client, pv.Name)
+		ginkgo.By("Wait for k8s cluster to be healthy")
+		// wait for the VMs to move back
+		err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+	})
+
+	/*
+	   PSOD hosts on secondary site
+	   Steps:
+	   1. Configure a vanilla multi-master K8s cluster with inter and intra site replication
+	   2. Create two statefulsets with replica count 1 (sts1) and 5 (sts2) respectively using a thick provision policy
+	   and wait for all replicas to be running
+	   3. Change replica count of sts1 and sts2 to 3
+	   4. PSOD all hosts on the secondary site
+	   5. Verify that the VMs on the secondary site are started up on the other esx servers in the primary site
+	   6. Verify there were no issues with replica scale up/down and verify pod entry in CNS volumemetadata for the
+	   volumes associated with the PVCs used by the statefulsets are updated
+	   7. Change replica count of sts1 to 5 and sts2 to 1 and verify they are successful
+	   8. Delete statefulsets and their pvcs created in step 2
+	   9. 
Bring primary site up and wait for testbed to be back to normal + */ + ginkgo.It("PSOD hosts on secondary site", + ginkgo.Label(p0, vsanStretch, block, wcp, tkg), func() { + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var svcCsipods, csipods *v1.PodList + var sts1Replicas, sts2Replicas, dep1ReplicaCount, dep2ReplicaCount int32 + + ginkgo.By("Creating StorageClass") + if vanillaCluster { + ginkgo.By("CNS_TEST: Running for vanilla k8s setup") + scParameters = map[string]string{} + scParameters["StoragePolicyName"] = storagePolicyName + scSpec := getVSphereStorageClassSpec(defaultNginxStorageClassName, scParameters, nil, "", "", false) + sc, err = client.StorageV1().StorageClasses().Create(ctx, scSpec, metav1.CreateOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + err := client.StorageV1().StorageClasses().Delete(ctx, sc.Name, *metav1.NewDeleteOptions(0)) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }() } else { - framework.Logf("Deleting pvc %v with reclaim policy Retain", pvclaimsWithRetain[i].Name) - err := fpv.DeletePersistentVolumeClaim(ctx, client, pvclaimsWithRetain[i].Name, namespace) + ginkgo.By("CNS_TEST: Running for GC setup") + sc, err = client.StorageV1().StorageClasses().Get(ctx, storagePolicyName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) } - } - framework.Logf("Sleeping full-sync interval for pvcs to be " + - "fully deleted") - time.Sleep(time.Duration(60) * time.Second) + ginkgo.By("Creating service") + service := CreateService(namespace, client) + defer func() { + deleteService(namespace, client, service) + }() - labels := make(map[string]string) - labels[labelKey] = labelValue + ginkgo.By("Creating statefulsets sts1 with replica count 1 and sts2 with 5 and wait for all" + + "the replicas to be running") - allPvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, pvc := range allPvcs.Items { - framework.Logf("Updating labels %+v for pvc %s in namespace %s", - labels, pvc.Name, namespace) - pvc, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvc.Name, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pvc.Labels = labels - _, err = client.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - "Error on updating pvc labels is: %v", err) - } + if rwxAccessMode { + dep1ReplicaCount = 3 + dep2ReplicaCount = 5 + } else { + dep1ReplicaCount = 1 + dep2ReplicaCount = 1 + } + sts1Replicas = 1 + sts2Replicas = 5 + statefulset1, deployment1, _ := createStsDeployment(ctx, client, namespace, sc, true, + false, sts1Replicas, "web", dep1ReplicaCount, accessMode) + statefulset2, deployment2, _ := createStsDeployment(ctx, client, namespace, sc, true, + true, sts2Replicas, "web-nginx", dep2ReplicaCount, accessMode) + ss2PodsBeforeScaleDown := fss.GetPodList(ctx, client, statefulset2) - allPvs, err := client.CoreV1().PersistentVolumes().List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - for _, pv := range allPvs.Items { - framework.Logf("Updating labels %+v for pv %s in namespace %s", - labels, pv.Name, namespace) - pv, err := client.CoreV1().PersistentVolumes().Get(ctx, pv.Name, metav1.GetOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - pv.Labels = labels - _, err = client.CoreV1().PersistentVolumes().Update(ctx, pv, 
metav1.UpdateOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred(), - "Error on updating pv labels is: %v", err) - } + defer func() { + scaleDownNDeleteStsDeploymentsInNamespace(ctx, client, namespace) + pvcs, err := client.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + for _, claim := range pvcs.Items { + pv := getPvFromClaim(client, namespace, claim.Name) + err := fpv.DeletePersistentVolumeClaim(ctx, client, claim.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Verify it's PV and corresponding volumes are deleted from CNS") + err = fpv.WaitForPersistentVolumeDeleted(ctx, client, pv.Name, poll, + pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + volumeHandle := pv.Spec.CSI.VolumeHandle + err = e2eVSphere.waitForCNSVolumeToBeDeleted(volumeHandle) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + fmt.Sprintf("Volume: %s should not be present in the CNS after it is deleted from "+ + "kubernetes", volumeHandle)) + } + }() - framework.Logf("Starting vsan-health on the vCenter host") - vcAddress = e2eVSphere.Config.Global.VCenterHostname + ":" + sshdPort - startVCServiceWait4VPs(ctx, vcAddress, vsanhealthServiceName, &isVsanHealthServiceStopped) + csipods, err = client.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - framework.Logf("Sleeping full-sync interval for vsan health service " + - "to be fully up") - time.Sleep(time.Duration(300) * time.Second) + if guestCluster { + svcCsipods, err = svcClient.CoreV1().Pods(csiNs).List(ctx, metav1.ListOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - csipods, err := client.CoreV1().Pods(csiSystemNamespace).List(ctx, metav1.ListOptions{}) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - // Get restConfig. 
- restConfig := getRestConfigClient() - cnsOperatorClient, err := k8s.NewClientForGroup(ctx, restConfig, cnsoperatorv1alpha1.GroupName) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) - enableFullSyncTriggerFss(ctx, client, csiSystemNamespace, fullSyncFss) - ginkgo.By("Bring down the secondary site while full sync is going on") - var wg sync.WaitGroup - wg.Add(2) - go triggerFullSyncInParallel(ctx, cnsOperatorClient, &wg) - go siteFailureInParallel(ctx, false, &wg) - wg.Wait() + if rwxAccessMode { + dep1ReplicaCount += 3 + dep2ReplicaCount += 3 + err = updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + err = updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) - defer func() { - ginkgo.By("Bring up the secondary site before terminating the test") - if len(fds.hostsDown) > 0 { - siteRestore(false) - fds.hostsDown = nil + } else { + sts1Replicas += 2 + ginkgo.By(fmt.Sprintf("Scaling up statefulset %v to number of Replica: %v", statefulset1.Name, sts1Replicas)) + fss.UpdateReplicas(ctx, client, statefulset1, sts1Replicas) + + sts2Replicas -= 2 + ginkgo.By(fmt.Sprintf("Scaling down statefulset: %v to number of Replica: %v", statefulset2.Name, sts2Replicas)) + fss.UpdateReplicas(ctx, client, statefulset2, sts2Replicas) } - }() - ginkgo.By("Wait for k8s cluster to be healthy") - wait4AllK8sNodesToBeUp(ctx, client, nodeList) - err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Bring down the secondary site") + psodHostsOnSite(false, "600") - // Check if csi pods are running fine after site failure - err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout*2) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + defer func() { + ginkgo.By("Bring up the primary site before terminating the test") + if len(fds.hostsDown) > 0 && fds.hostsDown != nil { + siteRestore(true) + fds.hostsDown = nil + } + }() + + if vanillaCluster { + wait4AllK8sNodesToBeUp(nodeList) + } + if vanillaCluster || guestCluster { + err = waitForAllNodes2BeReady(ctx, client) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + time.Sleep(5 * time.Minute) - ginkgo.By("Trigger 2 full syncs as full sync might be interrupted during site failover") - triggerFullSync(ctx, cnsOperatorClient) + if guestCluster { + ginkgo.By("Check for nodes to be in Ready state in supervisor") + err = fpod.WaitForPodsRunningReady(ctx, svcClient, csiNs, int32(svcCsipods.Size()), 0, pollTimeout) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Checking whether pods are in Running state") - for _, pod := range pods { - framework.Logf("Pod is %s", pod.Name) - err = waitForPodsToBeInErrorOrRunning(ctx, client, pod.Name, namespace, pollTimeout*4) + ginkgo.By("Check if csi pods are running fine after site recovery") + err = fpod.WaitForPodsRunningReady(ctx, client, csiNs, int32(csipods.Size()), 0, pollTimeout) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - } - for _, pvc := range allPvcs.Items { - ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pvc %s in namespace %s", - labels, pvc.Name, namespace)) - pv := getPvFromClaim(client, namespace, pvc.Name) - err = e2eVSphere.verifyLabelsAreUpdated(pv.Spec.CSI.VolumeHandle, labels, - string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace) - gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // 
Statefulsets and deployments in PodVMs might go to Terminating state as
+			// the nodes attached to these pods might become inaccessible during site failure.
+			// Hence these steps are validated once the site is restored.
+			if !supervisorCluster {

-	}

+				if rwxAccessMode {
+					dep1ReplicaCount += 3
+					framework.ExpectNoError(updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace))
+					verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil,
+						nil, "")
+					dep2ReplicaCount += 3
+					framework.ExpectNoError(updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace))
+					verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil,
+						nil, "")
+				} else {

-	for _, pv := range allPvs.Items {
-		ginkgo.By(fmt.Sprintf("Verifying labels %+v are updated for pv %s",
-			labels, pv.Name))
-		err = e2eVSphere.verifyLabelsAreUpdated(pv.Spec.CSI.VolumeHandle, labels,
-			string(cnstypes.CnsKubernetesEntityTypePV), pv.Name, pv.Namespace)
-		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+					ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2")
+					// Scale up replicas of statefulset1 and verify CNS entries for volumes
+					scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1,
+						sts1Replicas, false, true)
+					// Scale down replicas of statefulset2 and verify CNS entries for volumes
+					scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2,
+						ss2PodsBeforeScaleDown, sts2Replicas, false, true)
+
+					// Scale up statefulset sts1 to 5 replicas
+					sts1Replicas += 2
+					scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1,
+						sts1Replicas, true, false)
+
+					// Scale down statefulset sts2 to 1 replica
+					sts2Replicas -= 2
+					scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2,
+						ss2PodsBeforeScaleDown, sts2Replicas, true, false)
+
-	}
+				}
+
+			}
+
+			ginkgo.By("Bring up the primary site")
+			if len(fds.hostsDown) > 0 && fds.hostsDown != nil {
+				siteRestore(true)
+				fds.hostsDown = nil
+			}

-	// Deleting all pods in namespace
-	for _, pod := range pods {
-		ginkgo.By(fmt.Sprintf("Deleting the pod %s in namespace %s", pod.Name, namespace))
-		err = fpod.DeletePodWithWait(ctx, client, pod)
+			err = waitForAllNodes2BeReady(ctx, client)
 			gomega.Expect(err).NotTo(gomega.HaveOccurred())
-	}

-	ginkgo.By("Bring up the secondary site")
-	if len(fds.hostsDown) > 0 {
-		siteRestore(false)
-		fds.hostsDown = []string{}
-	}
+			if supervisorCluster {

-	ginkgo.By("Wait for k8s cluster to be healthy")
-	// wait for the VMs to move back
-	err = waitForAllNodes2BeReady(ctx, client, pollTimeout*4)
-	gomega.Expect(err).NotTo(gomega.HaveOccurred())
+				if rwxAccessMode {
+					dep1ReplicaCount += 3
+					framework.ExpectNoError(updateDeploymentReplicawithWait(client, dep1ReplicaCount, deployment1.Name, namespace))
+					verifyVolumeMetadataOnDeployments(ctx, client, deployment1, namespace, nil, nil,
+						nil, "")
+					dep2ReplicaCount += 3
+					framework.ExpectNoError(updateDeploymentReplicawithWait(client, dep2ReplicaCount, deployment2.Name, namespace))
+					verifyVolumeMetadataOnDeployments(ctx, client, deployment2, namespace, nil, nil,
+						nil, "")
+				} else {
+
+					ginkgo.By("Verifying statefulset scale up/down went fine on sts1 and sts2")
+					// Scale up replicas of statefulset1 and verify CNS entries for volumes
+					scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1,
+						sts1Replicas, false, true)
+					// Scale down replicas of statefulset2 and verify CNS entries for volumes
+					scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2,
+						ss2PodsBeforeScaleDown, sts2Replicas, false, true)
+
+					// Scale up statefulset sts1 to 5 replicas
+					
sts1Replicas += 2
+					scaleUpStsAndVerifyPodMetadata(ctx, client, namespace, statefulset1,
+						sts1Replicas, true, false)
+
+					// Scale down statefulset sts2 to 1 replica
+					sts2Replicas -= 2
+					scaleDownStsAndVerifyPodMetadata(ctx, client, namespace, statefulset2,
+						ss2PodsBeforeScaleDown, sts2Replicas, true, false)
+				}
+
+			}
+
+		})
-	})
 })
diff --git a/tests/e2e/vsan_stretched_cluster_utils.go b/tests/e2e/vsan_stretched_cluster_utils.go
index 7a41ac698f..f8eb015da0 100644
--- a/tests/e2e/vsan_stretched_cluster_utils.go
+++ b/tests/e2e/vsan_stretched_cluster_utils.go
@@ -654,24 +654,19 @@ func toggleWitnessPowerState(ctx context.Context, witnessHostDown bool) {
 // checkVmStorageCompliance checks VM and storage compliance of a storage policy
 // using govmomi
 func checkVmStorageCompliance(client clientset.Interface, storagePolicy string) bool {
-	ctx, cancel := context.WithCancel(context.Background())
+	_, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	masterIp := getK8sMasterIPs(ctx, client)
+
 	vcAddress := e2eVSphere.Config.Global.VCenterHostname
-	nimbusGeneratedK8sVmPwd := GetAndExpectStringEnvVar(nimbusK8sVmPwd)
 	vcAdminPwd := GetAndExpectStringEnvVar(vcUIPwd)
-	sshClientConfig := &ssh.ClientConfig{
-		User: "root",
-		Auth: []ssh.AuthMethod{
-			ssh.Password(nimbusGeneratedK8sVmPwd),
-		},
-		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
-	}
+
 	cmd := "export GOVC_INSECURE=1;"
 	cmd += fmt.Sprintf("export GOVC_URL='https://administrator@vsphere.local:%s@%s';", vcAdminPwd, vcAddress)
 	cmd += fmt.Sprintf("govc storage.policy.info -c -s %s;", storagePolicy)
-	_, err := sshExec(sshClientConfig, masterIp[0], cmd)
+	framework.Logf("Running command: %s", cmd)
+	result, err := exec.Command("/bin/bash", "-c", cmd).Output()
+	framework.Logf("res is: %s", string(result))
 	return strings.Contains(err.Error(), "object references is empty")
 }
@@ -980,7 +975,6 @@ func scaleDownStsAndVerifyPodMetadata(ctx context.Context, client clientset.Inte
 			pvclaim, err := client.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvcName, metav1.GetOptions{})
 			gomega.Expect(pvclaim).NotTo(gomega.BeNil())
-			gomega.Expect(err).NotTo(gomega.HaveOccurred())
 			err = waitAndVerifyCnsVolumeMetadata4GCVol(ctx, volHandle, svcPVCName, pvclaim, pv, &sspod)
 			gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -1283,3 +1277,69 @@ func checkForEventWithMessage(client clientset.Interface, namespace string,
 	}
 	return eventFound
 }
+
+// psodHostsOnSite executes PSOD operation on the hosts of the given site
+func psodHostsOnSite(primarySite bool, psodTimeout string) {
+	hosts := fds.secondarySiteHosts
+	if primarySite {
+		hosts = fds.primarySiteHosts
+	}
+
+	for _, host := range hosts {
+		framework.ExpectNoError(psodHost(host, psodTimeout))
+	}
+}
+
+// psodHostsInParallel is a wrapper method for psodHostsOnSite
+// which performs PSOD operation on the hosts of the given fault domain concurrently
+func psodHostsInParallel(primarySite bool, psodTimeout string, wg *sync.WaitGroup) {
+	defer wg.Done()
+	psodHostsOnSite(primarySite, psodTimeout)
+}
+
+// createStaticPvAndPvcInGuestClusterInParallel creates PV and PVC in a guest cluster
+// from a volume created in supervisor cluster concurrently
+func createStaticPvAndPvcInGuestClusterInParallel(client clientset.Interface, ctx context.Context,
+	namespace string, svcPVCNames []string, storageClassName string,
+	ch chan *v1.PersistentVolumeClaim, lock *sync.Mutex, wg *sync.WaitGroup) {
+	defer ginkgo.GinkgoRecover()
+	defer wg.Done()
+	for _, svcPVCName := range svcPVCNames {
+		framework.Logf("Volume Handle :%s", svcPVCName)
+
+ ginkgo.By("Creating PV in guest cluster") + gcPV := getPersistentVolumeSpecWithStorageclass(svcPVCName, + v1.PersistentVolumeReclaimRetain, storageClassName, nil, diskSize) + gcPV, err := client.CoreV1().PersistentVolumes().Create(ctx, gcPV, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + gcPVName := gcPV.GetName() + time.Sleep(time.Duration(10) * time.Second) + framework.Logf("PV name in GC : %s", gcPVName) + + ginkgo.By("Creating PVC in guest cluster") + gcPVC := getPVCSpecWithPVandStorageClass(svcPVCName, namespace, nil, gcPVName, storageClassName, diskSize) + gcPVC, err = client.CoreV1().PersistentVolumeClaims(namespace).Create(ctx, gcPVC, metav1.CreateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + lock.Lock() + ch <- gcPVC + lock.Unlock() + } +} + +// createCNSRegisterVolumeInParallel registers FCDs in CNS by calling +// createCNSRegisterVolume API concurrently +func createCNSRegisterVolumeInParallel(ctx context.Context, namespace string, + fcdIDs []string, pvcNames []string, wg *sync.WaitGroup) { + + defer wg.Done() + for i := range fcdIDs { + cnsRegisterVolume := getCNSRegisterVolumeSpec(ctx, namespace, fcdIDs[i], "", pvcNames[i], v1.ReadWriteOnce) + err := createCNSRegisterVolume(ctx, restConfig, cnsRegisterVolume) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + framework.ExpectNoError(waitForCNSRegisterVolumeToGetCreated(ctx, + restConfig, namespace, cnsRegisterVolume, poll, supervisorClusterOperationsTimeout)) + cnsRegisterVolumeName := cnsRegisterVolume.GetName() + framework.Logf("CNS register volume name : %s", cnsRegisterVolumeName) + } +}
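
Note (illustrative only, not part of the patch): the new psodHostsOnSite/psodHostsInParallel helpers follow the same *InParallel + sync.WaitGroup pattern already used by triggerFullSyncInParallel and siteFailureInParallel in vsan_stretched_cluster.go. A minimal Go sketch of how a test body might combine them, assuming ctx, cnsOperatorClient and the usual e2e helpers are in scope exactly as in the hunks above; everything else here is hypothetical, not code from this change:

	// Sketch: PSOD the secondary site while a CSI full sync is in flight.
	var wg sync.WaitGroup
	wg.Add(2)
	// Existing helper used earlier in this file to drive a full sync concurrently.
	go triggerFullSyncInParallel(ctx, cnsOperatorClient, &wg)
	// false selects the secondary site; "600" keeps the hosts in PSOD for 600 seconds
	// (BlueScreenTimeout) before they reboot on their own.
	go psodHostsInParallel(false, "600", &wg)
	wg.Wait()

	// Give the PSOD'd hosts time to reboot before validating workloads, as the
	// "PSOD hosts on secondary site" test above does with a 5 minute sleep.
	time.Sleep(5 * time.Minute)

Design note: psodHostsOnSite does not record anything in fds.hostsDown, so the deferred siteRestore blocks guarded by len(fds.hostsDown) > 0 remain no-ops after a PSOD; the hosts are expected to come back on their own once the BlueScreenTimeout expires.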