From 72ee4ca11182781b274c2769d3160d74fc608b9f Mon Sep 17 00:00:00 2001 From: ChrisLiu <70144550+chrisliu1995@users.noreply.github.com> Date: Wed, 6 Sep 2023 20:38:33 +0800 Subject: [PATCH] feat: support auto scaling-up based on minAvailable (#88) Signed-off-by: ChrisLiu --- docs/en/user_manuals/autoscale.md | 112 ++++++++++++++++- ...52\345\212\250\344\274\270\347\274\251.md" | 113 +++++++++++++++++- pkg/externalscaler/externalscaler.go | 42 ++++++- 3 files changed, 259 insertions(+), 8 deletions(-) diff --git a/docs/en/user_manuals/autoscale.md b/docs/en/user_manuals/autoscale.md index 7e969a10..f35cd155 100644 --- a/docs/en/user_manuals/autoscale.md +++ b/docs/en/user_manuals/autoscale.md @@ -1,5 +1,7 @@ ## Feature overview +### Auto Scaling-down + Compared to stateless service types, game servers have higher requirements for automatic scaling, especially in terms of scaling down. The differences between game servers become more and more obvious over time, and the precision requirements for scaling down are extremely high. Coarse-grained scaling mechanisms can easily cause negative effects such as player disconnections, resulting in huge losses for the business. @@ -59,7 +61,7 @@ spec: periodSeconds: 15 triggers: - type: external - metricType: Value + metricType: AverageValue metadata: scalerAddress: kruise-game-external-scaler.kruise-game-system:6000 @@ -96,4 +98,110 @@ NAME STATE OPSSTATE DP UP minecraft-1 Ready None 0 0 minecraft-2 Ready None 0 0 -``` \ No newline at end of file +``` + +### Auto Scaling-up + +In addition to setting an automatic scaling-down policy, you can also set an automatic scaling-up policy. 
+ +#### Scaling with resource metrics or custom metrics + +For example, native Kubernetes supports auto scaling-up based on CPU utilization, and the complete yaml is as follows: + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: minecraft # Fill in the name of the corresponding GameServerSet +spec: + scaleTargetRef: + name: minecraft # Fill in the name of the corresponding GameServerSet + apiVersion: game.kruise.io/v1alpha1 + kind: GameServerSet + pollingInterval: 30 + minReplicaCount: 0 + advanced: + horizontalPodAutoscalerConfig: + behavior: # Inherit from HPA behavior, refer to https://kubernetes.io/zh-cn/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior + scaleDown: + stabilizationWindowSeconds: 45 # Set the scaling-down stabilization window time to 45 seconds + policies: + - type: Percent + value: 100 + periodSeconds: 15 + triggers: + - type: external + metricType: AverageValue + metadata: + scalerAddress: kruise-game-external-scaler.kruise-game-system:6000 + - type: cpu + metricType: Utilization # Allowed types are 'Utilization' or 'AverageValue' + metadata: + value: "50" +``` + +When the game servers are stress-tested, you can see that the GameServerSet begins to scale up: + +```bash +kubectl get gss +NAME DESIRED CURRENT UPDATED READY MAINTAINING WAITTOBEDELETED AGE +minecraft 5 5 5 0 0 0 7s + +# After a while + +kubectl get gss +NAME DESIRED CURRENT UPDATED READY MAINTAINING WAITTOBEDELETED AGE +minecraft 20 20 20 20 0 0 137s +``` + +#### Set the minimum number of game servers whose opsState is None + +OKG supports setting the minimum number of game servers. When the current number of game servers whose opsState is None is less than the set value, OKG will automatically scale up new game servers so that the number of game servers whose opsState is None meets the set minimum number. + +The configuration method is as follows. 
In this example, the minimum number of game servers with opsState set to None is 3: + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: minecraft # Fill in the name of the corresponding GameServerSet +spec: + scaleTargetRef: + name: minecraft # Fill in the name of the corresponding GameServerSet + apiVersion: game.kruise.io/v1alpha1 + kind: GameServerSet + pollingInterval: 30 + minReplicaCount: 0 + advanced: + horizontalPodAutoscalerConfig: + behavior: # Inherit from HPA behavior, refer to https://kubernetes.io/zh-cn/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior + scaleDown: + stabilizationWindowSeconds: 45 # Set the scaling-down stabilization window time to 45 seconds + policies: + - type: Percent + value: 100 + periodSeconds: 15 + triggers: + - type: external + metricType: AverageValue + metadata: + minAvailable: "3" # Set the minimum number of game servers whose opsState is None + scalerAddress: kruise-game-external-scaler.kruise-game-system:6000 +``` + +First apply a GameServerSet with 1 replica. After the KEDA detection cycle, two new game servers are scaled up immediately. At this time, the number of game servers whose opsState is None is not less than the minAvailable value, and the scale-up process is complete. 
+ +```bash +kubectl get gs +NAME STATE OPSSTATE DP UP AGE +minecraft-0 Ready None 0 0 7s + +# After a while + +kubectl get gs +NAME STATE OPSSTATE DP UP AGE +minecraft-0 Ready None 0 0 20s +minecraft-1 Ready None 0 0 5s +minecraft-2 Ready None 0 0 5s +``` + diff --git "a/docs/\344\270\255\346\226\207/\347\224\250\346\210\267\346\211\213\345\206\214/\350\207\252\345\212\250\344\274\270\347\274\251.md" "b/docs/\344\270\255\346\226\207/\347\224\250\346\210\267\346\211\213\345\206\214/\350\207\252\345\212\250\344\274\270\347\274\251.md" index 8beef924..ff700a4a 100644 --- "a/docs/\344\270\255\346\226\207/\347\224\250\346\210\267\346\211\213\345\206\214/\350\207\252\345\212\250\344\274\270\347\274\251.md" +++ "b/docs/\344\270\255\346\226\207/\347\224\250\346\210\267\346\211\213\345\206\214/\350\207\252\345\212\250\344\274\270\347\274\251.md" @@ -1,5 +1,7 @@ ## 功能概览 +### 自动缩容 + 游戏服与无状态业务类型不同,对于自动伸缩特性有着更高的要求,其要求主要体现在缩容方面。 由于游戏为强有状态业务,随着时间的推移,游戏服之间的差异性愈加明显,缩容的精确度要求极高,粗糙的缩容机制容易造成玩家断线等负面影响,给业务造成巨大损失。 @@ -29,7 +31,7 @@ OKG 的自动伸缩机制如下所示 如此一来,OKG的自动伸缩器在缩容窗口期内只会删除处于WaitToBeDeleted状态的游戏服,真正做到定向缩容、精准缩容。 -## 使用示例 +**使用示例如下:** _**前置条件:在集群中安装 [KEDA](https://keda.sh/docs/2.10/deploy/)**_ @@ -58,7 +60,7 @@ spec: periodSeconds: 15 triggers: - type: external - metricType: Value + metricType: AverageValue metadata: scalerAddress: kruise-game-external-scaler.kruise-game-system:6000 ``` @@ -91,4 +93,109 @@ kubectl get gs NAME STATE OPSSTATE DP UP minecraft-1 Ready None 0 0 minecraft-2 Ready None 0 0 -``` \ No newline at end of file +``` + +### 自动扩容 + +除了设置自动缩容策略,也可以设置自动扩容策略。 + +#### 利用资源指标或自定义指标进行扩容 + +例如,原生Kubernetes支持使用CPU利用率进行扩容,其完整的yaml如下 + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: minecraft #填写对应GameServerSet的名称 +spec: + scaleTargetRef: + name: minecraft #填写对应GameServerSet的名称 + apiVersion: game.kruise.io/v1alpha1 + kind: GameServerSet + pollingInterval: 30 + minReplicaCount: 0 + advanced: + horizontalPodAutoscalerConfig: + behavior: #继承HPA策略,可参考文档 
https://kubernetes.io/zh-cn/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior + scaleDown: + stabilizationWindowSeconds: 45 #设置缩容稳定窗口时间为45秒 + policies: + - type: Percent + value: 100 + periodSeconds: 15 + triggers: + - type: external + metricType: AverageValue + metadata: + scalerAddress: kruise-game-external-scaler.kruise-game-system:6000 + - type: cpu + metricType: Utilization # 允许的类型是 "利用率 "或 "平均值" + metadata: + value: "50" +``` + +对游戏服进行压测,可以看到游戏服开始扩容 + +```bash +kubectl get gss +NAME DESIRED CURRENT UPDATED READY MAINTAINING WAITTOBEDELETED AGE +minecraft 5 5 5 0 0 0 7s + +# After a while + +kubectl get gss +NAME DESIRED CURRENT UPDATED READY MAINTAINING WAITTOBEDELETED AGE +minecraft 20 20 20 20 0 0 137s +``` + +#### 设置opsState为None的游戏服的最小个数 + +OKG支持设置游戏服最小数目。在当前所有opsState为None的游戏服数量少于设置的值时,OKG将自动扩容出新的游戏服,使opsState为None的游戏服数量满足设置的最小个数。 + +配置方式如下,在此例中设置opsState为None的游戏服的最小个数为3: + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: minecraft #填写对应GameServerSet的名称 +spec: + scaleTargetRef: + name: minecraft #填写对应GameServerSet的名称 + apiVersion: game.kruise.io/v1alpha1 + kind: GameServerSet + pollingInterval: 30 + minReplicaCount: 0 + advanced: + horizontalPodAutoscalerConfig: + behavior: #继承HPA策略,可参考文档 https://kubernetes.io/zh-cn/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior + scaleDown: + stabilizationWindowSeconds: 45 #设置缩容稳定窗口时间为45秒 + policies: + - type: Percent + value: 100 + periodSeconds: 15 + triggers: + - type: external + metricType: AverageValue + metadata: + minAvailable: "3" # 设置opsState为None的游戏服的最小个数 + scalerAddress: kruise-game-external-scaler.kruise-game-system:6000 +``` + +初始部署replicas为1的GameServerSet,经过KEDA探测周期后,马上扩容出两个新的游戏服。此时opsState为None的游戏服数量不小于设置的minAvailable值,完成了自动扩容。 + +```bash +kubectl get gs +NAME STATE OPSSTATE DP UP AGE +minecraft-0 Ready None 0 0 7s + +# After a while + +kubectl get gs +NAME STATE OPSSTATE DP UP AGE +minecraft-0 Ready None 0 0 
20s +minecraft-1 Ready None 0 0 5s +minecraft-2 Ready None 0 0 5s +``` diff --git a/pkg/externalscaler/externalscaler.go b/pkg/externalscaler/externalscaler.go index 1749f878..f7d927c4 100644 --- a/pkg/externalscaler/externalscaler.go +++ b/pkg/externalscaler/externalscaler.go @@ -10,6 +10,11 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" + "strconv" +) + +const ( + NoneGameServerMinNumberKey = "minAvailable" ) type ExternalScaler struct { @@ -62,11 +67,42 @@ func (e *ExternalScaler) GetMetrics(ctx context.Context, metricRequest *GetMetri return nil, err } - isWaitToDelete, _ := labels.NewRequirement(gamekruiseiov1alpha1.GameServerOpsStateKey, selection.Equals, []string{string(gamekruiseiov1alpha1.WaitToDelete)}) - notDeleting, _ := labels.NewRequirement(gamekruiseiov1alpha1.GameServerStateKey, selection.NotEquals, []string{string(gamekruiseiov1alpha1.Deleting)}) + // scale up when number of GameServers with None opsState less than minAvailable defined by user isGssOwner, _ := labels.NewRequirement(gamekruiseiov1alpha1.GameServerOwnerGssKey, selection.Equals, []string{name}) - + isNone, _ := labels.NewRequirement(gamekruiseiov1alpha1.GameServerOpsStateKey, selection.Equals, []string{string(gamekruiseiov1alpha1.None)}) podList := &corev1.PodList{} + err = e.client.List(ctx, podList, &client.ListOptions{ + Namespace: ns, + LabelSelector: labels.NewSelector().Add( + *isNone, + *isGssOwner, + ), + }) + if err != nil { + klog.Error(err) + return nil, err + } + + noneNum := len(podList.Items) + minNum, err := strconv.ParseInt(metricRequest.ScaledObjectRef.GetScalerMetadata()[NoneGameServerMinNumberKey], 10, 32) + if err != nil { + klog.Errorf("minAvailable should be integer type, err: %s", err.Error()) + } + if err == nil && noneNum < int(minNum) { + desireReplicas := *gss.Spec.Replicas + int32(minNum) - int32(noneNum) + klog.Infof("GameServerSet %s/%s desire replicas is %d", ns, name, desireReplicas) + return 
&GetMetricsResponse{ + MetricValues: []*MetricValue{{ + MetricName: "gssReplicas", + MetricValue: int64(desireReplicas), + }}, + }, nil + } + + // scale down those GameServers with WaitToBeDeleted opsState + isWaitToDelete, _ := labels.NewRequirement(gamekruiseiov1alpha1.GameServerOpsStateKey, selection.Equals, []string{string(gamekruiseiov1alpha1.WaitToDelete)}) + notDeleting, _ := labels.NewRequirement(gamekruiseiov1alpha1.GameServerStateKey, selection.NotEquals, []string{string(gamekruiseiov1alpha1.Deleting)}) + podList = &corev1.PodList{} err = e.client.List(ctx, podList, &client.ListOptions{ Namespace: ns, LabelSelector: labels.NewSelector().Add(