pv minimum check logic, label, alert added
Signed-off-by: Denis Shipkov <[email protected]>
grem-li committed Nov 19, 2024
1 parent 6085cc7 commit 2c3970a
Showing 8 changed files with 116 additions and 766 deletions.
@@ -39,6 +39,7 @@ const (
linstorResourcesWatcherCtrlName = "linstor-resources-watcher-controller"
missMatchedLabel = "storage.deckhouse.io/linstor-settings-mismatch"
unableToSetQuorumMinimumRedundancyLabel = "storage.deckhouse.io/unable-to-set-quorum-minimum-redundancy"
pvNotEnoughReplicasLabel = "storage.deckhouse.io/pv-not-enough-replicas"
PVCSIDriver = "replicated.csi.storage.deckhouse.io"
replicasOnSameRGKey = "replicas_on_same"
replicasOnDifferentRGKey = "replicas_on_different"
@@ -112,8 +113,23 @@ func NewLinstorResourcesWatcher(
rgMap[rg.Name] = rg
}

ReconcileParams(ctx, log, cl, lc, scMap, rdMap, rgMap)
ReconcileTieBreaker(ctx, log, lc, rdMap, rgMap)
pvsList, err := GetListPV(ctx, cl)
if err != nil {
log.Error(err, "[NewLinstorResourcesWatcher] unable to get Persistent Volumes")
}

resMap := make(map[string][]lapi.Resource, len(rdMap))
for name := range rdMap {
res, err := lc.Resources.GetAll(ctx, name)
if err != nil {
log.Error(err, fmt.Sprintf("[NewLinstorResourcesWatcher] unable to get Linstor Resources, name: %s", name))
}
resMap[name] = res
}

ReconcileParams(ctx, log, cl, lc, scMap, rdMap, rgMap, pvsList)
ReconcileTieBreaker(ctx, log, lc, rdMap, rgMap, resMap)
ReconcilePVReplicas(ctx, log, cl, lc, rdMap, rgMap, resMap, pvsList)

log.Info("[NewLinstorResourcesWatcher] ends reconcile")
}
@@ -128,12 +144,9 @@ func ReconcileParams(
scs map[string]v1.StorageClass,
rds map[string]lapi.ResourceDefinitionWithVolumeDefinition,
rgs map[string]lapi.ResourceGroup,
pvs []core.PersistentVolume,
) {
log.Info("[ReconcileParams] starts work")
pvs, err := GetListPV(ctx, cl)
if err != nil {
log.Error(err, "[ReconcileParams] unable to get Persistent Volumes")
}

for _, pv := range pvs {
if pv.Spec.CSI != nil && pv.Spec.CSI.Driver == PVCSIDriver {
@@ -206,30 +219,94 @@ func ReconcileParams(
log.Info("[ReconcileParams] ends work")
}

func ReconcileTieBreaker(
func ReconcilePVReplicas(
ctx context.Context,
log logger.Logger,
cl client.Client,
lc *lapi.Client,
rds map[string]lapi.ResourceDefinitionWithVolumeDefinition,
rgs map[string]lapi.ResourceGroup,
res map[string][]lapi.Resource,
pvs []core.PersistentVolume,
) {
log.Info("[ReconcileTieBreaker] starts work")
log.Info("[ReconcilePVReplicas] starts work")

allResources := make(map[string][]lapi.Resource, len(rds)*3)
for name := range rds {
res, err := lc.Resources.GetAll(ctx, name)
for _, pv := range pvs {
if pv.Spec.CSI != nil && pv.Spec.CSI.Driver == PVCSIDriver {
RGName := rds[pv.Name].ResourceGroupName
rg := rgs[RGName]
log.Debug(fmt.Sprintf("[ReconcilePVReplicas] PV: %s, RG: %s", pv.Name, rg.Name))

enoughReplicas := checkPVMinReplicasCount(ctx, log, lc, rg, res[pv.Name])

if pv.Labels == nil {
pv.Labels = make(map[string]string)
}

origLabelVal, exists := pv.Labels[pvNotEnoughReplicasLabel]
log.Debug(fmt.Sprintf("[ReconcilePVReplicas] Update label \"%s\", old: \"%s\", new: \"%t\"", pvNotEnoughReplicasLabel, origLabelVal, !enoughReplicas))

upd := false
if !enoughReplicas && (!exists || origLabelVal != "true") {
pv.Labels[pvNotEnoughReplicasLabel] = "true"
upd = true
}
if enoughReplicas && exists {
delete(pv.Labels, pvNotEnoughReplicasLabel)
upd = true
}

if upd {
err := UpdatePV(ctx, cl, &pv)
if err != nil {
log.Error(err, fmt.Sprintf("[ReconcilePVReplicas] unable to update the PV, name: %s", pv.Name))
}
}
}
}

log.Info("[ReconcilePVReplicas] ends work")
}

func checkPVMinReplicasCount(ctx context.Context, log logger.Logger, lc *lapi.Client, rg lapi.ResourceGroup, resList []lapi.Resource) bool {
placeCount := int(rg.SelectFilter.PlaceCount)
upVols := 0

if placeCount <= 0 {
return true
}

for _, r := range resList {
volList, err := lc.Resources.GetVolumes(ctx, r.Name, r.NodeName)
if err != nil {
log.Error(err, fmt.Sprintf("[ReconcileTieBreaker] unable to get Linstor Resources by the Resource Definition, name: %s", name))
log.Error(err, fmt.Sprintf("[checkPVMinReplicasCount] unable to get Linstor Resources Volumes, name: %s, node: %s", r.Name, r.NodeName))
}

allResources[name] = res
for _, v := range volList {
if v.State.DiskState == "UpToDate" {
upVols += 1
}
}
}

return upVols >= placeCount
}
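
The counting rule above can be illustrated in isolation. The following is a minimal, self-contained sketch using simplified stand-in types (not the real lapi structures): a volume set satisfies the minimum only when at least PlaceCount of its volumes report the UpToDate disk state.

```go
// Hypothetical illustration of the replica-counting rule in
// checkPVMinReplicasCount; the types are simplified stand-ins,
// not the real LINSTOR API (lapi) structures.
package main

import "fmt"

type volumeState struct {
	DiskState string
}

// hasEnoughReplicas mirrors the logic above: a non-positive place count
// means there is no minimum to enforce; otherwise at least placeCount
// volumes must be UpToDate.
func hasEnoughReplicas(placeCount int, vols []volumeState) bool {
	if placeCount <= 0 {
		return true
	}
	upToDate := 0
	for _, v := range vols {
		if v.DiskState == "UpToDate" {
			upToDate++
		}
	}
	return upToDate >= placeCount
}

func main() {
	vols := []volumeState{{"UpToDate"}, {"Outdated"}, {"UpToDate"}}
	fmt.Println(hasEnoughReplicas(3, vols)) // false: only 2 of 3 replicas are UpToDate
	fmt.Println(hasEnoughReplicas(2, vols)) // true
}
```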

func ReconcileTieBreaker(
ctx context.Context,
log logger.Logger,
lc *lapi.Client,
rds map[string]lapi.ResourceDefinitionWithVolumeDefinition,
rgs map[string]lapi.ResourceGroup,
res map[string][]lapi.Resource,
) {
log.Info("[ReconcileTieBreaker] starts work")

var (
nodes []lapi.Node
err error
)
for name, resources := range allResources {
for name, resources := range res {
if len(resources) == 0 {
log.Warning(fmt.Sprintf("[ReconcileTieBreaker] no actual Linstor Resources for the Resource Definition, name: %s", name))
continue
@@ -300,7 +377,9 @@ func getNodeForTieBreaker(
for _, node := range unusedNodes {
log.Trace(fmt.Sprintf("[getNodeForTieBreaker] resource %s does not use a node %s", resources[0].Name, node.Name))
}
rg := getResourceGroupByResource(resources[0].Name, rds, rgs)

RGName := rds[resources[0].Name].ResourceGroupName
rg := rgs[RGName]

if key, exist := rg.Props[replicasOnSameRGKey]; exist {
unusedNodes = filterNodesByReplicasOnSame(unusedNodes, key)
@@ -389,10 +468,6 @@ func filterNodesByReplicasOnSame(nodes []lapi.Node, key string) []lapi.Node {
return filtered
}

func getResourceGroupByResource(resourceName string, rds map[string]lapi.ResourceDefinitionWithVolumeDefinition, rgs map[string]lapi.ResourceGroup) lapi.ResourceGroup {
return rgs[rds[resourceName].ResourceGroupName]
}

func filterOutUsedNodes(nodes []lapi.Node, resources []lapi.Resource) []lapi.Node {
unusedNodes := make([]lapi.Node, 0, len(nodes))
resNodes := make(map[string]struct{}, len(resources))
@@ -13,11 +13,11 @@
plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes"
summary: Replicated PVs have incorrect settings
description: |
There are persistent volumes in the cluster that were created before migration to ReplicatedStorageClass.
You can recreate it, or add the label storage.deckhouse.io/linstor-settings-mismatch-ignore!=true to ignore it for the PV.
Please note that in the future, when transitioning from LINSTOR to a new controller, the settings for all such PVs will be automatically modified to match the current StorageClass settings.
You can view all such PVs with the command
`kubectl get pv -l storage.deckhouse.io/linstor-settings-mismatch=true,storage.deckhouse.io/linstor-settings-mismatch-ignore!=true`
Also, you can add label for all incorrect PVs
@@ -38,3 +38,23 @@
There are persistent volumes in the cluster that have an incorrect quorum-minimum-redundancy setting.
Please contact tech support for assistance.
- alert: ReplicatedPVWithIncorrectReplicasCount
expr: count(kube_persistentvolume_labels{label_storage_deckhouse_io_pv_not_enough_replicas="true"}) > 0
for: 5m
labels:
severity_level: "3"
tier: cluster
annotations:
plk_markup_format: "markdown"
plk_protocol_version: "1"
plk_create_group_if_not_exists__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes"
plk_grouped_by__d8_drbd_device_health: "ReplicatedPVSettingsCheck,tier=~tier,prometheus=deckhouse,kubernetes=~kubernetes"
summary: Replicated PVs do not have enough replicas
description: |
There are persistent volumes in the cluster that do not have enough replicas (the number of UpToDate resources is less than the minimum count).
You can get the minimum limit for a StorageClass with the command
`kubectl get sc -o yaml | grep -E "(\sname|/placementCount)"`
and view all resource states with
`linstor r l`
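
As a usage note, PVs flagged by the new label can also be listed programmatically. The sketch below is not part of the commit; it assumes an in-cluster client-go configuration and is simply the Go equivalent of `kubectl get pv -l storage.deckhouse.io/pv-not-enough-replicas=true`.

```go
// Hypothetical sketch: list PVs carrying the
// storage.deckhouse.io/pv-not-enough-replicas=true label via client-go.
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	cfg, err := rest.InClusterConfig() // assumes the code runs inside the cluster
	if err != nil {
		panic(err)
	}
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	pvs, err := clientset.CoreV1().PersistentVolumes().List(context.Background(), metav1.ListOptions{
		LabelSelector: "storage.deckhouse.io/pv-not-enough-replicas=true",
	})
	if err != nil {
		panic(err)
	}
	for _, pv := range pvs.Items {
		fmt.Println(pv.Name)
	}
}
```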
98 changes: 0 additions & 98 deletions test/config/config.go

This file was deleted.

78 changes: 0 additions & 78 deletions test/go.mod

This file was deleted.
