From c9a1eac976c428917b2abd77dceb7ded3357042c Mon Sep 17 00:00:00 2001 From: "shenmu.wy" Date: Tue, 5 Dec 2023 10:32:12 +0800 Subject: [PATCH 1/6] bugfix: fix problems in scheduling volume group --- cmd/controller/antplugins/filter/localline.go | 4 +- .../v1/snapshot_webhook.go | 9 +- .../manager/reconciler/snapshot_reconciler.go | 9 +- .../manager/scheduler/sched_vol_group.go | 88 +++++++++---------- pkg/csi/client/client.go | 3 +- pkg/util/const.go | 4 + 6 files changed, 57 insertions(+), 60 deletions(-) diff --git a/cmd/controller/antplugins/filter/localline.go b/cmd/controller/antplugins/filter/localline.go index dbd3210..183c606 100644 --- a/cmd/controller/antplugins/filter/localline.go +++ b/cmd/controller/antplugins/filter/localline.go @@ -4,12 +4,12 @@ import ( v1 "code.alipay.com/dbplatform/node-disk-controller/pkg/api/volume.antstor.alipay.com/v1" "code.alipay.com/dbplatform/node-disk-controller/pkg/controller/manager/scheduler/filter" "code.alipay.com/dbplatform/node-disk-controller/pkg/controller/manager/state" + "code.alipay.com/dbplatform/node-disk-controller/pkg/util" "k8s.io/klog/v2" ) const ( minLocalStoragePct float64 = 20 - fourMiB int64 = 1 << 22 // ReasonLocalStorageTooLow = "LocalStorageTooLow" ) @@ -53,6 +53,6 @@ func GetAllocatableRemoveVolumeSize(node *state.Node, volSize int64) (result int } } - result = result / fourMiB * fourMiB + result = result / util.FourMiB * util.FourMiB return } diff --git a/pkg/api/volume.antstor.alipay.com/v1/snapshot_webhook.go b/pkg/api/volume.antstor.alipay.com/v1/snapshot_webhook.go index e2ef67f..fded0df 100644 --- a/pkg/api/volume.antstor.alipay.com/v1/snapshot_webhook.go +++ b/pkg/api/volume.antstor.alipay.com/v1/snapshot_webhook.go @@ -1,16 +1,13 @@ package v1 import ( + "code.alipay.com/dbplatform/node-disk-controller/pkg/util" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" ) -const ( - 
fourMiB int64 = 1 << 22 -) - // log is for logging in this package. var snaplog = logf.Log.WithName("snapshot-webhook") @@ -26,8 +23,8 @@ var _ webhook.Defaulter = &AntstorSnapshot{} // Default implements webhook.Defaulter so a webhook will be registered for the type func (r *AntstorSnapshot) Default() { - if remainder := r.Spec.Size % fourMiB; remainder > 0 { - r.Spec.Size = (r.Spec.Size / fourMiB) * fourMiB + if remainder := r.Spec.Size % util.FourMiB; remainder > 0 { + r.Spec.Size = (r.Spec.Size / util.FourMiB) * util.FourMiB snaplog.Info("defaulter", "name", r.Name, "set Size=", r.Spec.Size) } } diff --git a/pkg/controller/manager/reconciler/snapshot_reconciler.go b/pkg/controller/manager/reconciler/snapshot_reconciler.go index ee801b3..12f323e 100644 --- a/pkg/controller/manager/reconciler/snapshot_reconciler.go +++ b/pkg/controller/manager/reconciler/snapshot_reconciler.go @@ -8,6 +8,7 @@ import ( v1 "code.alipay.com/dbplatform/node-disk-controller/pkg/api/volume.antstor.alipay.com/v1" "code.alipay.com/dbplatform/node-disk-controller/pkg/generated/clientset/versioned" + "code.alipay.com/dbplatform/node-disk-controller/pkg/util" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -24,8 +25,6 @@ const ( SnapshotCreateFailure = "SnapshotCreateFailure" SnapshotMergeFailure = "SnapshotMergeFailure" SnapshotDeleteFailure = "SnapshotDeleteFailure" - - fourMiB int64 = 1 << 22 ) type SnapshotReconciler struct { @@ -123,12 +122,12 @@ func (r *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c // TODO: validate Snapshot // 1. 
size align to 4MiB - if obj.Spec.Size < fourMiB { + if obj.Spec.Size < util.FourMiB { r.EventRecorder.Event(&obj, corev1.EventTypeWarning, SnapshotCreateFailure, "size too small, at least 4MiB") return ctrl.Result{}, nil } - if remainder := obj.Spec.Size % fourMiB; remainder > 0 { - obj.Spec.Size = (obj.Spec.Size / fourMiB) * fourMiB + if remainder := obj.Spec.Size % util.FourMiB; remainder > 0 { + obj.Spec.Size = (obj.Spec.Size / util.FourMiB) * util.FourMiB err = r.Update(context.Background(), &obj) return ctrl.Result{}, err } diff --git a/pkg/controller/manager/scheduler/sched_vol_group.go b/pkg/controller/manager/scheduler/sched_vol_group.go index 4d833cb..94da30a 100644 --- a/pkg/controller/manager/scheduler/sched_vol_group.go +++ b/pkg/controller/manager/scheduler/sched_vol_group.go @@ -2,11 +2,13 @@ package scheduler import ( "fmt" + "math" "sort" v1 "code.alipay.com/dbplatform/node-disk-controller/pkg/api/volume.antstor.alipay.com/v1" "code.alipay.com/dbplatform/node-disk-controller/pkg/controller/manager/scheduler/filter" "code.alipay.com/dbplatform/node-disk-controller/pkg/controller/manager/state" + "code.alipay.com/dbplatform/node-disk-controller/pkg/util" "code.alipay.com/dbplatform/node-disk-controller/pkg/util/misc" uuid "github.com/satori/go.uuid" "k8s.io/apimachinery/pkg/api/resource" @@ -14,8 +16,6 @@ import ( ) var ( - fourMiB int64 = 1 << 22 - ExtraPickSizeFnMap = make(map[string]GetAllocatableVolumeSizeFn) ) @@ -31,6 +31,7 @@ func (s *scheduler) ScheduleVolumeGroup(allNodes []*state.Node, volGroup *v1.Ant scheduledSize int64 needSched bool volGroupCopy = volGroup.DeepCopy() + qualified []*state.Node ) // check unscheduled @@ -49,7 +50,11 @@ func (s *scheduler) ScheduleVolumeGroup(allNodes []*state.Node, volGroup *v1.Ant defer s.lock.Unlock() // filter qualified nodes - qualified := s.filterNodes(allNodes, volGroupCopy) + qualified, err = s.filterNodes(allNodes, volGroupCopy) + if err != nil { + klog.Error(err) + return + } // sort nodes by free 
space, large -> small // node usage < empty threashold, set score to 0, last of the list @@ -57,6 +62,7 @@ func (s *scheduler) ScheduleVolumeGroup(allNodes []*state.Node, volGroup *v1.Ant err = schedVolGroup(qualified, volGroup) if err != nil { + klog.Error(err) return } @@ -93,9 +99,9 @@ func schedVolGroup(nodes []*state.Node, volGroup *v1.AntstorVolumeGroup) (err er // align to 4MiB bytes := int64(picked.AsApproximateFloat64()) - result = (bytes / fourMiB) * fourMiB - if bytes%fourMiB > 0 { - result += fourMiB + result = (bytes / util.FourMiB) * util.FourMiB + if bytes%util.FourMiB > 0 { + result += util.FourMiB } return } @@ -209,52 +215,42 @@ func schedVolGroup(nodes []*state.Node, volGroup *v1.AntstorVolumeGroup) (err er return } -func (s *scheduler) filterNodes(allNodes []*state.Node, volGroup *v1.AntstorVolumeGroup) (qualified []*state.Node) { +func (s *scheduler) filterNodes(allNodes []*state.Node, volGroup *v1.AntstorVolumeGroup) (qualified []*state.Node, err error) { var ( - minSize = volGroup.Spec.DesiredVolumeSpec.SizeRange.Min - maxRemoteVolCnt = s.cfg.Scheduler.MaxRemoteVolumeCount + minSize = volGroup.Spec.DesiredVolumeSpec.SizeRange.Min + // Here we build a fake AntstorVolume, which has minSize size and empty HostNode. + // Therefore the filter only checks pool status, pool affinity in Annotation, remote volume count, + // and SPDK condition because host node id is always different from target node id. 
+ // filter will make sure that pool free size is larger than minSize + vol = &v1.AntstorVolume{ + ObjectMeta: volGroup.ObjectMeta, + Spec: v1.AntstorVolumeSpec{ + SizeByte: uint64(math.Round(minSize.AsApproximateFloat64())), + HostNode: &v1.NodeInfo{}, + PositionAdvice: v1.NoPreference, + }, + } ) // filter out unqualified nodes - for _, node := range allNodes { - // pool status - if !node.Pool.IsSchedulable() { - continue - } - - // node free space < min size - free := node.FreeResource.Storage() - if free.Cmp(minSize) < 0 { - continue - } - - // node spdk unhealthy - var spdkCond = v1.StatusError - for _, cond := range node.Pool.Status.Conditions { - if cond.Type == v1.PoolConditionSpkdHealth { - spdkCond = cond.Status + qualified, err = filter.NewFilterChain(s.cfg.Scheduler). + Filter(func(ctx *filter.FilterContext, node *state.Node, vol *v1.AntstorVolume) bool { + // filter empty node + if !volGroup.Spec.Stragety.AllowEmptyNode { + if len(node.Volumes) == 0 { + klog.Infof("[SchedFail] volGroup=%s Pool %s, Pool is empty", volGroup.Name, node.Pool.Name) + return false + } + // TODO: compare with volGroup.Spec.Stragety.EmptyThreasholdPct } - } - if spdkCond != v1.StatusOK { - continue - } + return true + }). + Input(allNodes, vol). + LoadFilterFromConfig(). 
+ MatchAll() - // filter empty node - if !volGroup.Spec.Stragety.AllowEmptyNode { - freeFloat := free.AsApproximateFloat64() - total := node.Pool.GetVgTotalBytes() - // if node's real usage < EmptyThreasholdPct, the node is considered as empty - if (float64(total)-freeFloat)/float64(total)*100 <= float64(volGroup.Spec.Stragety.EmptyThreasholdPct) { - continue - } - } - - // remote volume count - if node.RemoteVolumesCount(s.cfg.Scheduler.RemoteIgnoreAnnoSelector)+1 >= maxRemoteVolCnt { - continue - } - - qualified = append(qualified, node) + if len(qualified) == 0 { + return } return diff --git a/pkg/csi/client/client.go b/pkg/csi/client/client.go index 37f8c08..84cf2cd 100644 --- a/pkg/csi/client/client.go +++ b/pkg/csi/client/client.go @@ -6,6 +6,7 @@ import ( v1 "code.alipay.com/dbplatform/node-disk-controller/pkg/api/volume.antstor.alipay.com/v1" "code.alipay.com/dbplatform/node-disk-controller/pkg/generated/clientset/versioned" + "code.alipay.com/dbplatform/node-disk-controller/pkg/util" uuid "github.com/satori/go.uuid" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" rest "k8s.io/client-go/rest" @@ -13,7 +14,7 @@ import ( ) var ( - fourMiB uint64 = 1 << 22 + fourMiB uint64 = uint64(util.FourMiB) _ AntstorClientIface = &KubeAPIClient{} ErrorNotFoundResource = fmt.Errorf("ResourceNotFound") diff --git a/pkg/util/const.go b/pkg/util/const.go index 51d003a..ca2ab32 100644 --- a/pkg/util/const.go +++ b/pkg/util/const.go @@ -21,3 +21,7 @@ const ( KubeConfigUserAgent = "obnvmf-node-disk/v0.0.1" KubeCfgUserAgentCSI = "obnvmf-csi/v0.0.1" ) + +const ( + FourMiB int64 = 1 << 22 +) From 1ab46264b16071895dab8ec193e2a7863586760f Mon Sep 17 00:00:00 2001 From: "shenmu.wy" Date: Tue, 5 Dec 2023 10:37:20 +0800 Subject: [PATCH 2/6] update status msg if scheduling volume group failed --- pkg/controller/manager/reconciler/volume_group_reconciler.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/controller/manager/reconciler/volume_group_reconciler.go 
b/pkg/controller/manager/reconciler/volume_group_reconciler.go index 792ea2d..e01391c 100644 --- a/pkg/controller/manager/reconciler/volume_group_reconciler.go +++ b/pkg/controller/manager/reconciler/volume_group_reconciler.go @@ -331,6 +331,11 @@ func (r *AntstorVolumeGroupReconciler) scheduleVolGroup(ctx *plugin.Context, vol if err != nil { // TODO: update status log.Error(err, "sched volumegroup failed, retry in 1 min") + volGroup.Status.Message = err.Error() + updateErr := r.Status().Update(ctx.Ctx, volGroup) + if updateErr != nil { + log.Error(updateErr, err.Error()) + } return plugin.Result{Break: true, Result: ctrl.Result{RequeueAfter: time.Minute}} } From 527acdd7434a0134d81356c0a9809b3c55a1e90e Mon Sep 17 00:00:00 2001 From: "shenmu.wy" Date: Tue, 5 Dec 2023 11:18:02 +0800 Subject: [PATCH 3/6] new base image with nvme client --- hack/docker/Dockerfile.build_all | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/docker/Dockerfile.build_all b/hack/docker/Dockerfile.build_all index 5fd739b..109400e 100644 --- a/hack/docker/Dockerfile.build_all +++ b/hack/docker/Dockerfile.build_all @@ -25,7 +25,7 @@ ENV GOPROXY=${GOPROXY} RUN make controller RUN make scheduler -FROM reg.docker.alibaba-inc.com/dbplatform/debian:lvm-20230510 +FROM reg.docker.alibaba-inc.com/dbplatform/debian:lvm-20231205 ARG DBUILD_DATE ARG DBUILD_REPO_URL From b9384b31839bb6b8b0500b29e87fd3d4075b805c Mon Sep 17 00:00:00 2001 From: "shenmu.wy" Date: Tue, 5 Dec 2023 13:41:28 +0800 Subject: [PATCH 4/6] add base image dockerfile --- hack/docker/Dockerfile.base | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 hack/docker/Dockerfile.base diff --git a/hack/docker/Dockerfile.base b/hack/docker/Dockerfile.base new file mode 100644 index 0000000..322dc8e --- /dev/null +++ b/hack/docker/Dockerfile.base @@ -0,0 +1,15 @@ +FROM debian:bullseye-slim +LABEL maintainers="silentred" +LABEL description="debian bullseye-slim with lvm2, xfs, ext4, pcie, kmod, mount 
utils" + +RUN apt-get update && \ + # for CSI node + apt-get install -y util-linux e2fsprogs xfsprogs mount ca-certificates udev kmod nvme-cli && \ + # for disk-agent + apt-get install -y lvm2 pciutils && \ + rm -rf /var/lib/apt/lists/* + +RUN sed -i 's/use_lvmetad = 1/use_lvmetad = 0/' /etc/lvm/lvm.conf && \ + sed -i 's/use_lvmpolld = 1/use_lvmpolld = 0/' /etc/lvm/lvm.conf && \ + sed -i 's/udev_sync = 1/udev_sync = 0/' /etc/lvm/lvm.conf && \ + sed -i 's/udev_rules = 1/udev_rules = 0/' /etc/lvm/lvm.conf \ No newline at end of file From 77a64346eac63257cdbcab41d6fac0c76c500ca3 Mon Sep 17 00:00:00 2001 From: "shenmu.wy" Date: Tue, 5 Dec 2023 13:47:37 +0800 Subject: [PATCH 5/6] update base image --- hack/docker/Dockerfile.build_all | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/docker/Dockerfile.build_all b/hack/docker/Dockerfile.build_all index 109400e..5f565d7 100644 --- a/hack/docker/Dockerfile.build_all +++ b/hack/docker/Dockerfile.build_all @@ -25,7 +25,7 @@ ENV GOPROXY=${GOPROXY} RUN make controller RUN make scheduler -FROM reg.docker.alibaba-inc.com/dbplatform/debian:lvm-20231205 +FROM reg.docker.alibaba-inc.com/dbplatform/debian:lvm-20231206 ARG DBUILD_DATE ARG DBUILD_REPO_URL From aee2a4f28c319f74b984261084b6961addc217e1 Mon Sep 17 00:00:00 2001 From: "shenmu.wy" Date: Tue, 5 Dec 2023 14:00:57 +0800 Subject: [PATCH 6/6] update node-disk-controller image in deployment --- hack/deploy/base/500-disk-operator.yaml | 2 +- hack/deploy/base/600-csi-controller.yaml | 2 +- hack/deploy/base/700-disk-agent.yaml | 2 +- hack/deploy/base/800-csi-node.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hack/deploy/base/500-disk-operator.yaml b/hack/deploy/base/500-disk-operator.yaml index 992b176..f996750 100644 --- a/hack/deploy/base/500-disk-operator.yaml +++ b/hack/deploy/base/500-disk-operator.yaml @@ -48,7 +48,7 @@ spec: secretName: selfsigned-cert-tls containers: - name: node-disk-controller - image: 
silentred/node-disk-controller:ba03b9e2-20231129185104 + image: silentred/node-disk-controller:77a64346-20231205134756 command: - /node-disk-controller args: diff --git a/hack/deploy/base/600-csi-controller.yaml b/hack/deploy/base/600-csi-controller.yaml index 448ff6a..5abf932 100644 --- a/hack/deploy/base/600-csi-controller.yaml +++ b/hack/deploy/base/600-csi-controller.yaml @@ -18,7 +18,7 @@ spec: serviceAccount: obnvmf-admin containers: - name: csi-antstor - image: silentred/node-disk-controller:ba03b9e2-20231129185104 + image: silentred/node-disk-controller:77a64346-20231205134756 command: - /node-disk-controller args: diff --git a/hack/deploy/base/700-disk-agent.yaml b/hack/deploy/base/700-disk-agent.yaml index 71757b9..5379aea 100644 --- a/hack/deploy/base/700-disk-agent.yaml +++ b/hack/deploy/base/700-disk-agent.yaml @@ -26,7 +26,7 @@ spec: capabilities: add: ["ALL"] allowPrivilegeEscalation: true - image: silentred/node-disk-controller:ba03b9e2-20231129185104 + image: silentred/node-disk-controller:77a64346-20231205134756 #imagePullPolicy: Always command: - /node-disk-controller diff --git a/hack/deploy/base/800-csi-node.yaml b/hack/deploy/base/800-csi-node.yaml index d275916..1a6c05b 100644 --- a/hack/deploy/base/800-csi-node.yaml +++ b/hack/deploy/base/800-csi-node.yaml @@ -26,7 +26,7 @@ spec: capabilities: add: ["CAP_MKNOD", "CAP_SYS_ADMIN", "SYS_ADMIN", "SYS_RAWIO"] allowPrivilegeEscalation: true - image: silentred/node-disk-controller:ba03b9e2-20231129185104 + image: silentred/node-disk-controller:77a64346-20231205134756 command: - /node-disk-controller args: