From 49a0c849dbced1688ae2b64adca25c16aeecc720 Mon Sep 17 00:00:00 2001
From: Houston Putman
Date: Fri, 14 Jun 2024 15:09:31 -0500
Subject: [PATCH 1/5] Add PVC volume expansion

---
 controllers/solr_cluster_ops_util.go | 59 ++++++++++++++++++++++-
 controllers/solrcloud_controller.go  | 71 ++++++++++++++++++++++++----
 controllers/util/solr_util.go        | 24 ++++++++++
 3 files changed, 143 insertions(+), 11 deletions(-)

diff --git a/controllers/solr_cluster_ops_util.go b/controllers/solr_cluster_ops_util.go
index 916446b3..fa1952a5 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -27,6 +27,7 @@ import (
 	"github.com/go-logr/logr"
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/pointer"
 	"net/url"
@@ -53,6 +54,7 @@ const (
 	ScaleUpLock         SolrClusterOperationType = "ScalingUp"
 	UpdateLock          SolrClusterOperationType = "RollingUpdate"
 	BalanceReplicasLock SolrClusterOperationType = "BalanceReplicas"
+	PvcExpansionLock    SolrClusterOperationType = "PVCExpansion"
 )
 
 // RollingUpdateMetadata contains metadata for rolling update cluster operations.
@@ -150,6 +152,60 @@ func retryNextQueuedClusterOpWithQueue(statefulSet *appsv1.StatefulSet, clusterO
 	return hasOp, err
 }
 
+func determinePvcExpansionClusterOpLockIfNecessary(instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err error) {
+	if instance.Spec.StorageOptions.PersistentStorage != nil &&
+		instance.Spec.StorageOptions.PersistentStorage.PersistentVolumeClaimTemplate.Spec.Resources.Requests.Storage() != nil &&
+		instance.Spec.StorageOptions.PersistentStorage.PersistentVolumeClaimTemplate.Spec.Resources.Requests.Storage().String() != statefulSet.Annotations[util.StorageMinimumSizeAnnotation] {
+		// First make sure that the new Storage request is greater than what is already set.
+		// PVCs cannot be shrunk
+		newSize := instance.Spec.StorageOptions.PersistentStorage.PersistentVolumeClaimTemplate.Spec.Resources.Requests.Storage()
+		// If there is no old size to update, the StatefulSet can just be set to use the new PVC size without any issue.
+		// Only do a cluster operation if we are expanding from an existing size to a new size
+		if oldSizeStr, hasOldSize := statefulSet.Annotations[util.StorageMinimumSizeAnnotation]; hasOldSize {
+			if oldSize, e := resource.ParseQuantity(oldSizeStr); e != nil {
+				err = e
+				// TODO: add an event
+			} else {
+				// Only update to the new size if it is bigger, we cannot shrink PVCs
+				if newSize.Cmp(oldSize) > 0 {
+					clusterOp = &SolrClusterOp{
+						Operation: PvcExpansionLock,
+						Metadata:  newSize.String(),
+					}
+				}
+				// TODO: add an event saying that we cannot shrink PVCs
+			}
+		}
+	}
+	return
+}
+
+// handlePvcExpansion does the logic of a managed and "locked" PVC expansion operation.
+// This will likely take multiple reconcile loops to complete, as it waits for every PVC to be resized by the storage provider before triggering a rolling restart.
+func handlePvcExpansion(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, clusterOp *SolrClusterOp, logger logr.Logger) (operationComplete bool, retryLaterDuration time.Duration, err error) {
+	var newSize resource.Quantity
+	newSize, err = resource.ParseQuantity(clusterOp.Metadata)
+	if err != nil {
+		logger.Error(err, "Could not convert PvcExpansion metadata to a resource.Quantity, as it represents the new size of PVCs", "metadata", clusterOp.Metadata)
+		return
+	}
+	operationComplete, err = r.expandPVCs(ctx, instance, statefulSet.Spec.Selector.MatchLabels, newSize, logger)
+	if err == nil && operationComplete {
+		originalStatefulSet := statefulSet.DeepCopy()
+		statefulSet.Annotations[util.StorageMinimumSizeAnnotation] = newSize.String()
+		statefulSet.Spec.Template.Annotations[util.StorageMinimumSizeAnnotation] = newSize.String()
+		if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
+			logger.Error(err, "Error while patching StatefulSet to set the new minimum PVC size after the completion of PVC resizing", "newSize", newSize)
+			operationComplete = false
+		}
+		// Return and wait for the StatefulSet to be updated, which will trigger a reconcile that starts the rolling restart
+		retryLaterDuration = 0
+	} else if err == nil {
+		retryLaterDuration = time.Second * 5
+	}
+	return
+}
+
 func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, scaleDownOpIsQueued bool, podList []corev1.Pod, blockReconciliationOfStatefulSet bool, logger logr.Logger) (clusterOp *SolrClusterOp, retryLaterDuration time.Duration, err error) {
 	desiredPods := int(*instance.Spec.Replicas)
 	configuredPods := int(*statefulSet.Spec.Replicas)
@@ -291,7 +347,8 @@ func cleanupManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, p
 // handleManagedCloudScaleUp does the logic of a managed and "locked" cloud scale up operation.
 // This will likely take many reconcile loops to complete, as it is moving replicas to the pods that have recently been scaled up.
func handleManagedCloudScaleUp(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, clusterOp *SolrClusterOp, podList []corev1.Pod, logger logr.Logger) (operationComplete bool, nextClusterOperation *SolrClusterOp, err error) { - desiredPods, err := strconv.Atoi(clusterOp.Metadata) + desiredPods := 0 + desiredPods, err = strconv.Atoi(clusterOp.Metadata) if err != nil { logger.Error(err, "Could not convert ScaleUp metadata to int, as it represents the number of nodes to scale to", "metadata", clusterOp.Metadata) return diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go index 9940ff9e..5e87d166 100644 --- a/controllers/solrcloud_controller.go +++ b/controllers/solrcloud_controller.go @@ -22,6 +22,7 @@ import ( "crypto/md5" "fmt" policyv1 "k8s.io/api/policy/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime" "reflect" "sort" @@ -483,6 +484,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( operationComplete, nextClusterOperation, err = handleManagedCloudScaleUp(ctx, r, instance, statefulSet, clusterOp, podList, logger) case BalanceReplicasLock: operationComplete, requestInProgress, retryLaterDuration, err = util.BalanceReplicasForCluster(ctx, instance, statefulSet, clusterOp.Metadata, clusterOp.Metadata, logger) + case PvcExpansionLock: + operationComplete, retryLaterDuration, err = handlePvcExpansion(ctx, r, instance, statefulSet, clusterOp, logger) default: operationFound = false // This shouldn't happen, but we don't want to be stuck if it does. @@ -550,6 +553,12 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( clusterOpQueue[queueIdx] = *clusterOp clusterOp = nil } + clusterOp, retryLaterDuration, err = determinePvcExpansionClusterOpLockIfNecessary(instance, statefulSet) + // If the new clusterOperation is an update to a queued clusterOp, just change the operation that is already queued + if queueIdx, opIsQueued := queuedRetryOps[UpdateLock]; clusterOp != nil && opIsQueued { + clusterOpQueue[queueIdx] = *clusterOp + clusterOp = nil + } // If a non-managed scale needs to take place, this method will update the StatefulSet without starting // a "locked" cluster operation @@ -932,6 +941,46 @@ func (r *SolrCloudReconciler) reconcileZk(ctx context.Context, logger logr.Logge return nil } +func (r *SolrCloudReconciler) expandPVCs(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string, newSize resource.Quantity, logger logr.Logger) (expansionComplete bool, err error) { + var pvcList corev1.PersistentVolumeClaimList + pvcList, err = r.getPVCList(ctx, cloud, pvcLabelSelector) + if err != nil { + return + } + expansionCompleteCount := 0 + for _, pvcItem := range pvcList.Items { + if pvcExpansionComplete, e := r.expandPVC(ctx, &pvcItem, newSize, logger); e != nil { + err = e + } else if pvcExpansionComplete { + expansionCompleteCount += 1 + } + } + // If all PVCs have been expanded, then we are done + expansionComplete = err == nil && expansionCompleteCount == len(pvcList.Items) + return +} + +func (r *SolrCloudReconciler) expandPVC(ctx context.Context, pvc *corev1.PersistentVolumeClaim, newSize resource.Quantity, logger logr.Logger) (expansionComplete bool, err error) { + // If the current capacity is >= the new size, then there is nothing to do, expansion is complete + if pvc.Status.Capacity.Storage().Cmp(newSize) >= 0 { + // TODO: Eventually use the pvc.Status.AllocatedResources 
and pvc.Status.AllocatedResourceStatuses to determine the status of PVC Expansion and react to failures + expansionComplete = true + } else if !pvc.Spec.Resources.Requests.Storage().Equal(newSize) { + // Update the pvc if the capacity request is different. + // The newSize might be smaller than the current size, but this is supported as the last size might have been too + // big for the storage quota, so it was lowered. + // As long as the PVCs current capacity is lower than the new size, we are still good to update the PVC. + originalPvc := pvc.DeepCopy() + pvc.Spec.Resources.Requests[corev1.ResourceStorage] = newSize + if err = r.Patch(ctx, pvc, client.StrategicMergeFrom(originalPvc)); err != nil { + logger.Error(err, "Error while expanding PersistentVolumeClaim size", "persistentVolumeClaim", pvc.Name, "size", newSize) + } else { + logger.Info("Expanded PersistentVolumeClaim size", "persistentVolumeClaim", pvc.Name, "size", newSize) + } + } + return +} + // Logic derived from: // - https://book.kubebuilder.io/reference/using-finalizers.html // - https://github.com/pravega/zookeeper-operator/blob/v0.2.9/pkg/controller/zookeepercluster/zookeepercluster_controller.go#L629 @@ -978,16 +1027,15 @@ func (r *SolrCloudReconciler) reconcileStorageFinalizer(ctx context.Context, clo return nil } -func (r *SolrCloudReconciler) getPVCCount(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (pvcCount int, err error) { +func (r *SolrCloudReconciler) getPVCCount(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (int, error) { pvcList, err := r.getPVCList(ctx, cloud, pvcLabelSelector) if err != nil { return -1, err } - pvcCount = len(pvcList.Items) - return pvcCount, nil + return len(pvcList.Items), nil } -func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx context.Context, cloud *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, pvcLabelSelector map[string]string, logger logr.Logger) (err error) { +func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx context.Context, cloud *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, pvcLabelSelector map[string]string, logger logr.Logger) error { // this check should make sure we do not delete the PVCs before the STS has scaled down if cloud.Status.ReadyReplicas == cloud.Status.Replicas { pvcList, err := r.getPVCList(ctx, cloud, pvcLabelSelector) @@ -1003,28 +1051,31 @@ func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx context.Context, cloud *solr // Don't use the Spec replicas here, because we might be rolling down 1-by-1 and the PVCs for // soon-to-be-deleted pods should not be deleted until the pod is deleted. 
if util.IsPVCOrphan(pvcItem.Name, *statefulSet.Spec.Replicas) { - r.deletePVC(ctx, pvcItem, logger) + if e := r.deletePVC(ctx, pvcItem, logger); e != nil { + err = e + } } } } + return err } return nil } -func (r *SolrCloudReconciler) getPVCList(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (pvList corev1.PersistentVolumeClaimList, err error) { +func (r *SolrCloudReconciler) getPVCList(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string) (corev1.PersistentVolumeClaimList, error) { selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ MatchLabels: pvcLabelSelector, }) - pvclistOps := &client.ListOptions{ + pvcListOps := &client.ListOptions{ Namespace: cloud.Namespace, LabelSelector: selector, } pvcList := &corev1.PersistentVolumeClaimList{} - err = r.Client.List(ctx, pvcList, pvclistOps) + err = r.Client.List(ctx, pvcList, pvcListOps) return *pvcList, err } -func (r *SolrCloudReconciler) cleanUpAllPVCs(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string, logger logr.Logger) (err error) { +func (r *SolrCloudReconciler) cleanUpAllPVCs(ctx context.Context, cloud *solrv1beta1.SolrCloud, pvcLabelSelector map[string]string, logger logr.Logger) error { pvcList, err := r.getPVCList(ctx, cloud, pvcLabelSelector) if err != nil { return err @@ -1032,7 +1083,7 @@ func (r *SolrCloudReconciler) cleanUpAllPVCs(ctx context.Context, cloud *solrv1b for _, pvcItem := range pvcList.Items { r.deletePVC(ctx, pvcItem, logger) } - return nil + return err } func (r *SolrCloudReconciler) deletePVC(ctx context.Context, pvcItem corev1.PersistentVolumeClaim, logger logr.Logger) { diff --git a/controllers/util/solr_util.go b/controllers/util/solr_util.go index de44d7c1..a7111ff3 100644 --- a/controllers/util/solr_util.go +++ b/controllers/util/solr_util.go @@ -57,6 +57,7 @@ const ( // These are to be saved on a statefulSet update ClusterOpsLockAnnotation = "solr.apache.org/clusterOpsLock" ClusterOpsRetryQueueAnnotation = "solr.apache.org/clusterOpsRetryQueue" + StorageMinimumSizeAnnotation = "solr.apache.org/storageMinimumSize" SolrIsNotStoppedReadinessCondition = "solr.apache.org/isNotStopped" SolrReplicasNotEvictedReadinessCondition = "solr.apache.org/replicasNotEvicted" @@ -200,6 +201,13 @@ func GenerateStatefulSet(solrCloud *solr.SolrCloud, solrCloudStatus *solr.SolrCl Spec: pvc.Spec, }, } + if pvc.Spec.Resources.Requests.Storage() != nil { + annotations[StorageMinimumSizeAnnotation] = pvc.Spec.Resources.Requests.Storage().String() + if podAnnotations == nil { + podAnnotations = make(map[string]string, 1) + } + podAnnotations[StorageMinimumSizeAnnotation] = pvc.Spec.Resources.Requests.Storage().String() + } } else { ephemeralVolume := corev1.Volume{ Name: solrDataVolumeName, @@ -680,6 +688,22 @@ func MaintainPreservedStatefulSetFields(expected, found *appsv1.StatefulSet) { } expected.Annotations[ClusterOpsRetryQueueAnnotation] = queue } + if storage, hasStorage := found.Annotations[StorageMinimumSizeAnnotation]; hasStorage { + if expected.Annotations == nil { + expected.Annotations = make(map[string]string, 1) + } + expected.Annotations[StorageMinimumSizeAnnotation] = storage + } + } + if found.Spec.Template.Annotations != nil { + // Note: the Pod template storage annotation is used to start a rolling restart, + // it should always match the StatefulSet's storage annotation + if storage, hasStorage := found.Spec.Template.Annotations[StorageMinimumSizeAnnotation]; hasStorage { + if 
expected.Spec.Template.Annotations == nil { + expected.Spec.Template.Annotations = make(map[string]string, 1) + } + expected.Spec.Template.Annotations[StorageMinimumSizeAnnotation] = storage + } } // Scaling (i.e. changing) the number of replicas in the SolrCloud statefulSet is handled during the clusterOps From 7f8157782a5b0eb43664bfd21f6bce4c88e8cecd Mon Sep 17 00:00:00 2001 From: Houston Putman Date: Fri, 14 Jun 2024 15:19:34 -0500 Subject: [PATCH 2/5] Fix error --- controllers/solrcloud_controller.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go index 5e87d166..f05e3512 100644 --- a/controllers/solrcloud_controller.go +++ b/controllers/solrcloud_controller.go @@ -1051,9 +1051,7 @@ func (r *SolrCloudReconciler) cleanupOrphanPVCs(ctx context.Context, cloud *solr // Don't use the Spec replicas here, because we might be rolling down 1-by-1 and the PVCs for // soon-to-be-deleted pods should not be deleted until the pod is deleted. if util.IsPVCOrphan(pvcItem.Name, *statefulSet.Spec.Replicas) { - if e := r.deletePVC(ctx, pvcItem, logger); e != nil { - err = e - } + r.deletePVC(ctx, pvcItem, logger) } } } From 83fc30e7419268d777d110fc9a9173db914a9af2 Mon Sep 17 00:00:00 2001 From: Houston Putman Date: Thu, 20 Jun 2024 14:08:22 -0500 Subject: [PATCH 3/5] Add PVC permissions --- config/rbac/role.yaml | 2 ++ controllers/solrcloud_controller.go | 2 +- helm/solr-operator/templates/role.yaml | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index c4f9b41f..45f6d561 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -44,6 +44,8 @@ rules: - delete - get - list + - patch + - update - watch - apiGroups: - "" diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go index f05e3512..8e2069b7 100644 --- a/controllers/solrcloud_controller.go +++ b/controllers/solrcloud_controller.go @@ -73,7 +73,7 @@ func UseZkCRD(useCRD bool) { //+kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses/status,verbs=get //+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=configmaps/status,verbs=get -//+kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;delete +//+kubebuilder:rbac:groups="",resources=persistentvolumeclaims,verbs=get;list;watch;update;patch;delete //+kubebuilder:rbac:groups=policy,resources=poddisruptionbudgets,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=zookeeper.pravega.io,resources=zookeeperclusters,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=zookeeper.pravega.io,resources=zookeeperclusters/status,verbs=get diff --git a/helm/solr-operator/templates/role.yaml b/helm/solr-operator/templates/role.yaml index da950e4e..f7a28ac0 100644 --- a/helm/solr-operator/templates/role.yaml +++ b/helm/solr-operator/templates/role.yaml @@ -48,6 +48,8 @@ rules: - delete - get - list + - patch + - update - watch - apiGroups: - "" From 069c684bf00dbd3e2c4fe1518577ef62628c6ca1 Mon Sep 17 00:00:00 2001 From: Houston Putman Date: Thu, 20 Jun 2024 14:26:05 -0500 Subject: [PATCH 4/5] Add changelog and update docs --- docs/solr-cloud/solr-cloud-crd.md | 5 +++-- docs/upgrade-notes.md | 2 +- helm/solr/Chart.yaml | 9 +++------ 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/solr-cloud/solr-cloud-crd.md 
b/docs/solr-cloud/solr-cloud-crd.md
index c1053cd1..79ba9af9 100644
--- a/docs/solr-cloud/solr-cloud-crd.md
+++ b/docs/solr-cloud/solr-cloud-crd.md
@@ -61,8 +61,9 @@ These options can be found in `SolrCloud.spec.dataStorage`
 - **`pvcTemplate`** -
   The template of the PVC to use for the solr data PVCs. By default the name will be "data".
   Only the `pvcTemplate.spec` field is required, metadata is optional.
-  Note: This template cannot be changed unless the SolrCloud is deleted and recreated.
-  This is a [limitation of StatefulSets and PVCs in Kubernetes](https://github.com/kubernetes/enhancements/issues/661).
+  Note: Currently, [Kubernetes does not support PVC resizing (expanding) in StatefulSets](https://github.com/kubernetes/enhancements/issues/661).
+  However, the Solr Operator will manage PVC expansion for users until it is natively supported in Kubernetes.
+  Therefore, `pvcTemplate.spec.resources.requests` may be updated to request more storage, but all other `pvcTemplate.spec` fields should be considered immutable.
 
 - **`ephemeral`**
   There are two types of ephemeral volumes that can be specified.
diff --git a/docs/upgrade-notes.md b/docs/upgrade-notes.md
index 448dfb24..32fedc58 100644
--- a/docs/upgrade-notes.md
+++ b/docs/upgrade-notes.md
@@ -124,7 +124,7 @@ _Note that the Helm chart version does not contain a `v` prefix, which the downl
 ### v0.8.0
 - **The minimum supported Solr version is now 8.11**
   If you are unable to use a newer version of Solr, please install the `v0.7.1` version of the Solr Operator.
-  However, it is strongly suggested to upgrade to newer versions of Solr that are actively supported.q
+  However, it is strongly suggested to upgrade to newer versions of Solr that are actively supported.
   See the [version compatibility matrix](#solr-versions) for more information.
 
 - **Kubernetes support is now limited to 1.22+.**
diff --git a/helm/solr/Chart.yaml b/helm/solr/Chart.yaml
index f93a1182..03e3bf00 100644
--- a/helm/solr/Chart.yaml
+++ b/helm/solr/Chart.yaml
@@ -42,15 +42,12 @@ annotations:
   # Allowed syntax is described at: https://artifacthub.io/docs/topics/annotations/helm/#example
   artifacthub.io/changes: |
     - kind: added
-      description: Addition 1
+      description: Allow resizing (expanding) of persistent data PVCs
       links:
         - name: Github Issue
-          url: https://github.com/issue-url
-    - kind: changed
-      description: Change 2
-      links:
+          url: https://github.com/apache/solr-operator/issues/709
         - name: Github PR
-          url: https://github.com/pr-url
+          url: https://github.com/apache/solr-operator/pull/712
   artifacthub.io/containsSecurityUpdates: "false"
   artifacthub.io/recommendations: |
     - url: https://artifacthub.io/packages/helm/apache-solr/solr-operator

From 441931fc6315c12d59fa8c94a52b72984d548301 Mon Sep 17 00:00:00 2001
From: Houston Putman
Date: Thu, 5 Sep 2024 17:51:55 -0500
Subject: [PATCH 5/5] Add integration test. Cannot test yet, since volume expansion is not supported

---
 tests/e2e/solrcloud_storage_test.go | 151 ++++++++++++++++++++++++++++
 tests/e2e/suite_test.go             |  43 +++++++-
 tests/scripts/manage_e2e_tests.sh   |   3 +
 3 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 tests/e2e/solrcloud_storage_test.go

diff --git a/tests/e2e/solrcloud_storage_test.go b/tests/e2e/solrcloud_storage_test.go
new file mode 100644
index 00000000..94289554
--- /dev/null
+++ b/tests/e2e/solrcloud_storage_test.go
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package e2e
+
+import (
+	"context"
+	solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
+	"github.com/apache/solr-operator/controllers"
+	"github.com/apache/solr-operator/controllers/util"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/labels"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"time"
+)
+
+var _ = FDescribe("E2E - SolrCloud - Storage", func() {
+	var (
+		solrCloud *solrv1beta1.SolrCloud
+
+		solrCollection1 = "e2e-1"
+
+		solrCollection2 = "e2e-2"
+	)
+
+	BeforeEach(func() {
+		solrCloud = generateBaseSolrCloud(2)
+	})
+
+	JustBeforeEach(func(ctx context.Context) {
+		By("creating the SolrCloud")
+		Expect(k8sClient.Create(ctx, solrCloud)).To(Succeed())
+
+		DeferCleanup(func(ctx context.Context) {
+			cleanupTest(ctx, solrCloud)
+		})
+
+		By("Waiting for the SolrCloud to come up healthy")
+		solrCloud = expectSolrCloudToBeReady(ctx, solrCloud)
+
+		By("creating a first Solr Collection")
+		createAndQueryCollection(ctx, solrCloud, solrCollection1, 1, 2)
+
+		By("creating a second Solr Collection")
+		createAndQueryCollection(ctx, solrCloud, solrCollection2, 2, 1)
+	})
+
+	FContext("Persistent Data - Expansion", func() {
+		BeforeEach(func() {
+			solrCloud.Spec.StorageOptions = solrv1beta1.SolrDataStorageOptions{
+				PersistentStorage: &solrv1beta1.SolrPersistentDataStorageOptions{
+					PersistentVolumeClaimTemplate: solrv1beta1.PersistentVolumeClaimTemplate{
+						Spec: corev1.PersistentVolumeClaimSpec{
+							Resources: corev1.ResourceRequirements{
+								Requests: map[corev1.ResourceName]resource.Quantity{
+									corev1.ResourceStorage: resource.MustParse("1G"),
+								},
+							},
+						},
+					},
+				},
+			}
+		})
+
+		FIt("Fully Expands", func(ctx context.Context) {
+			newStorageSize := resource.MustParse("1500M")
+			patchedSolrCloud := solrCloud.DeepCopy()
+			patchedSolrCloud.Spec.StorageOptions.PersistentStorage.PersistentVolumeClaimTemplate.Spec.Resources.Requests[corev1.ResourceStorage] = newStorageSize
+			By("increasing the storage request to trigger a PVC expansion")
+			Expect(k8sClient.Patch(ctx, patchedSolrCloud, client.MergeFrom(solrCloud))).To(Succeed(), "Could not increase the SolrCloud storage request to initiate PVC expansion")
+
+			// Wait for the operator to notice the larger storage request; once it does, a PvcExpansion clusterOp should be in progress
+			expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*5, time.Millisecond*50, func(g Gomega, found *appsv1.StatefulSet) {
+				clusterOp, err := controllers.GetCurrentClusterOp(found)
+				g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud")
+				g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a PvcExpansion lock.")
+				g.Expect(clusterOp.Operation).To(Equal(controllers.PvcExpansionLock),
"StatefulSet does not have a PvcExpansion lock after starting managed update.") + }) + + By("waiting for the expansion's rolling restart to begin") + solrCloud = expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, time.Second*30, time.Millisecond*100, func(g Gomega, found *solrv1beta1.SolrCloud) { + g.Expect(found.Status.UpToDateNodes).To(BeZero(), "Cloud did not get to a state with zero up-to-date replicas when rolling restart began.") + for _, nodeStatus := range found.Status.SolrNodes { + g.Expect(nodeStatus.SpecUpToDate).To(BeFalse(), "Node not starting as out-of-date when rolling restart begins: %s", nodeStatus.Name) + } + }) + + By("checking that all PVCs have been expanded when the restart begins") + internalLabels := map[string]string{ + util.SolrPVCTechnologyLabel: util.SolrCloudPVCTechnology, + util.SolrPVCStorageLabel: util.SolrCloudPVCDataStorage, + util.SolrPVCInstanceLabel: solrCloud.Name, + } + pvcListOps := &client.ListOptions{ + Namespace: solrCloud.Namespace, + LabelSelector: labels.SelectorFromSet(internalLabels), + } + + foundPVCs := &corev1.PersistentVolumeClaimList{} + Expect(k8sClient.List(ctx, foundPVCs, pvcListOps)).To(Succeed(), "Could not fetch PVC list") + Expect(foundPVCs.Items).To(HaveLen(int(*solrCloud.Spec.Replicas)), "Did not find the same number of PVCs as Solr Pods") + for _, pvc := range foundPVCs.Items { + Expect(pvc.Spec.Resources).To(HaveKeyWithValue(corev1.ResourceStorage, newStorageSize), "The PVC %q does not have the new storage size in its resource requests", pvc.Name) + Expect(pvc.Status.Capacity).To(HaveKeyWithValue(corev1.ResourceStorage, newStorageSize), "The PVC %q does not have the new storage size in its status.capacity", pvc.Name) + } + + statefulSet := expectStatefulSetWithChecksAndTimeout(ctx, solrCloud, solrCloud.StatefulSetName(), 1, time.Millisecond, func(g Gomega, found *appsv1.StatefulSet) { + clusterOp, err := controllers.GetCurrentClusterOp(found) + g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud") + g.Expect(clusterOp).ToNot(BeNil(), "StatefulSet does not have a RollingUpdate lock.") + g.Expect(clusterOp.Operation).To(Equal(controllers.UpdateLock), "StatefulSet does not have a RollingUpdate lock after starting managed update to increase the storage size.") + g.Expect(clusterOp.Metadata).To(Equal(controllers.RollingUpdateMetadata{RequiresReplicaMigration: false}), "StatefulSet should not require replica migration, since PVCs are being used.") + }) + + By("waiting for the rolling restart to complete") + expectSolrCloudWithChecksAndTimeout(ctx, solrCloud, time.Second*90, time.Millisecond*5, func(g Gomega, cloud *solrv1beta1.SolrCloud) { + g.Expect(cloud.Status.UpToDateNodes).To(BeEquivalentTo(*statefulSet.Spec.Replicas), "The Rolling Update never completed, not all replicas up to date") + g.Expect(cloud.Status.ReadyReplicas).To(BeEquivalentTo(*statefulSet.Spec.Replicas), "The Rolling Update never completed, not all replicas ready") + }) + + By("waiting for the rolling restart to complete") + expectStatefulSetWithConsistentChecksAndDuration(ctx, solrCloud, solrCloud.StatefulSetName(), time.Second*2, func(g Gomega, found *appsv1.StatefulSet) { + clusterOp, err := controllers.GetCurrentClusterOp(found) + g.Expect(err).ToNot(HaveOccurred(), "Error occurred while finding clusterLock for SolrCloud") + g.Expect(clusterOp).To(BeNil(), "StatefulSet should not have any cluster lock after finishing its rolling update.") + }) + + By("checking that the collections can be queried after the restart") + 
queryCollection(ctx, solrCloud, solrCollection1, 0) + queryCollection(ctx, solrCloud, solrCollection2, 0) + }) + }) +}) diff --git a/tests/e2e/suite_test.go b/tests/e2e/suite_test.go index 1c2aec54..d15dde57 100644 --- a/tests/e2e/suite_test.go +++ b/tests/e2e/suite_test.go @@ -312,11 +312,26 @@ func writeAllSolrInfoToFiles(ctx context.Context, directory string, namespace st for _, pod := range foundPods.Items { writeAllPodInfoToFiles( ctx, - directory+pod.Name, + directory+pod.Name+".pod", &pod, ) } + listOps = &client.ListOptions{ + Namespace: namespace, + LabelSelector: labelSelector, + } + + foundPVCs := &corev1.PersistentVolumeClaimList{} + Expect(k8sClient.List(ctx, foundPVCs, listOps)).To(Succeed(), "Could not fetch Solr PVCs") + Expect(foundPVCs).ToNot(BeNil(), "No Solr PVCs could be found") + for _, pvc := range foundPVCs.Items { + writeAllPvcInfoToFiles( + directory+pvc.Name+".pvc", + &pvc, + ) + } + foundStatefulSets := &appsv1.StatefulSetList{} Expect(k8sClient.List(ctx, foundStatefulSets, listOps)).To(Succeed(), "Could not fetch Solr statefulSets") Expect(foundStatefulSets).ToNot(BeNil(), "No Solr statefulSet could be found") @@ -388,6 +403,32 @@ func writeAllStatefulSetInfoToFiles(baseFilename string, statefulSet *appsv1.Sta Expect(writeErr).ToNot(HaveOccurred(), "Could not write statefulSet events json to file") } +// writeAllPvcInfoToFiles writes the following each to a separate file with the given base name & directory. +// - PVC Spec/Status +// - PVC Events +func writeAllPvcInfoToFiles(baseFilename string, pvc *corev1.PersistentVolumeClaim) { + // Write PVC to a file + statusFile, err := os.Create(baseFilename + ".status.json") + defer statusFile.Close() + Expect(err).ToNot(HaveOccurred(), "Could not open file to save PVC status: %s", baseFilename+".status.json") + jsonBytes, marshErr := json.MarshalIndent(pvc, "", "\t") + Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize PVC json") + _, writeErr := statusFile.Write(jsonBytes) + Expect(writeErr).ToNot(HaveOccurred(), "Could not write PVC json to file") + + // Write events for PVC to a file + eventsFile, err := os.Create(baseFilename + ".events.json") + defer eventsFile.Close() + Expect(err).ToNot(HaveOccurred(), "Could not open file to save PVC events: %s", baseFilename+".events.yaml") + + eventList, err := rawK8sClient.CoreV1().Events(pvc.Namespace).Search(scheme.Scheme, pvc) + Expect(err).ToNot(HaveOccurred(), "Could not find events for PVC: %s", pvc.Name) + jsonBytes, marshErr = json.MarshalIndent(eventList, "", "\t") + Expect(marshErr).ToNot(HaveOccurred(), "Could not serialize PVC events json") + _, writeErr = eventsFile.Write(jsonBytes) + Expect(writeErr).ToNot(HaveOccurred(), "Could not write PVC events json to file") +} + // writeAllServiceInfoToFiles writes the following each to a separate file with the given base name & directory. // - Service func writeAllServiceInfoToFiles(baseFilename string, service *corev1.Service) { diff --git a/tests/scripts/manage_e2e_tests.sh b/tests/scripts/manage_e2e_tests.sh index c42b92fb..61621324 100755 --- a/tests/scripts/manage_e2e_tests.sh +++ b/tests/scripts/manage_e2e_tests.sh @@ -167,6 +167,9 @@ function start_cluster() { echo "Create test Kubernetes ${KUBERNETES_VERSION} cluster in KinD. This will allow us to test the CRDs, Helm chart and the Docker image." 
kind create cluster --name "${CLUSTER_NAME}" --image "kindest/node:${KUBERNETES_VERSION}" --config "${SCRIPT_DIR}/e2e-kind-config.yaml" + # TODO: Remove when the following issue is resolved: https://github.com/kubernetes-sigs/kind/issues/3734 + kubectl patch storageclass standard -p '{"allowVolumeExpansion":true}' + setup_cluster }
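For reference, the expansion flow added in [PATCH 1/5] reduces to a compare-and-patch loop over `resource.Quantity` values: a PVC counts as done once its observed capacity has reached the requested size, and its spec is only patched while the request still differs from the target. The sketch below restates that pattern in isolation. It is only an illustration under assumptions: the package name `pvcexpand`, the function `expandOnePVC`, and the direct use of a controller-runtime `client.Client` are not part of the patch, and it presumes the PVC already carries a storage request (as Solr data PVCs do).

package pvcexpand

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// expandOnePVC returns true once the PVC's observed capacity has reached newSize.
// Until then, it ensures the PVC's storage request is set to newSize so that the
// storage provider can resize the underlying volume asynchronously.
func expandOnePVC(ctx context.Context, c client.Client, pvc *corev1.PersistentVolumeClaim, newSize resource.Quantity) (bool, error) {
	// Capacity already at or above the target: nothing left to do for this PVC.
	if pvc.Status.Capacity.Storage().Cmp(newSize) >= 0 {
		return true, nil
	}
	// The spec request does not match the target yet: patch it and let the
	// volume driver carry out the actual resize. A request may legitimately be
	// lowered here, as long as it stays above the current capacity.
	if pvc.Spec.Resources.Requests.Storage().Cmp(newSize) != 0 {
		original := pvc.DeepCopy()
		pvc.Spec.Resources.Requests[corev1.ResourceStorage] = newSize
		if err := c.Patch(ctx, pvc, client.MergeFrom(original)); err != nil {
			return false, err
		}
	}
	// The request is in place, but the resize has not finished yet.
	return false, nil
}

Checking `Status.Capacity` rather than the spec request is what delays the follow-up rolling restart until the volumes have actually grown, and it is also why a request that was lowered after hitting a storage quota can simply be patched again instead of being rejected, as long as it still exceeds the current capacity.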