From 7b93963d0e9ab5b202848911353685899cc7bf8b Mon Sep 17 00:00:00 2001
From: Moshe Immermam
Date: Mon, 18 Nov 2024 10:34:20 +0200
Subject: [PATCH] fix: replicaset health

---
 pkg/health/health_replicaset.go               | 77 ++++------------
 pkg/health/health_test.go                     |  4 +-
 .../Kubernetes/ReplicaSet/unknown.yaml        | 89 +++++++++++++++++++
 3 files changed, 109 insertions(+), 61 deletions(-)
 create mode 100644 pkg/health/testdata/Kubernetes/ReplicaSet/unknown.yaml

diff --git a/pkg/health/health_replicaset.go b/pkg/health/health_replicaset.go
index 8a3df02..391e7ff 100644
--- a/pkg/health/health_replicaset.go
+++ b/pkg/health/health_replicaset.go
@@ -2,7 +2,6 @@ package health
 
 import (
 	"fmt"
-	"time"
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
@@ -18,79 +17,39 @@ func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
 		if err != nil {
 			return nil, err
 		}
-		return getAppsv1ReplicaSetHealth(&replicaSet)
+		return getAppsv1ReplicaSetHealth(&replicaSet, obj)
 	default:
 		return nil, fmt.Errorf("unsupported ReplicaSet GVK: %s", gvk)
 	}
 }
 
-func getAppsv1ReplicaSetHealth(rs *appsv1.ReplicaSet) (*HealthStatus, error) {
+func getAppsv1ReplicaSetHealth(rs *appsv1.ReplicaSet, obj *unstructured.Unstructured) (*HealthStatus, error) {
 	replicas := int32(0)
 	if rs.Spec.Replicas != nil {
 		replicas = *rs.Spec.Replicas
 	}
 
-	startDeadline := GetStartDeadline(rs.Spec.Template.Spec.Containers...)
-	age := time.Since(rs.CreationTimestamp.Time).Truncate(time.Minute).Abs()
-	health := HealthHealthy
-	if rs.Status.ReadyReplicas == 0 {
-		if rs.Status.Replicas > 0 && age < startDeadline {
-			health = HealthUnknown
-		} else {
-			health = HealthUnhealthy
-		}
-	} else if rs.Status.ReadyReplicas < replicas {
-		health = HealthWarning
-	} else if rs.Status.ReadyReplicas >= replicas {
-		health = HealthHealthy
-	}
+	hr := getReplicaHealth(ReplicaStatus{
+		Object:     obj,
+		Containers: rs.Spec.Template.Spec.Containers,
+		Desired:    int(replicas),
+		Replicas:   int(rs.Status.Replicas),
+		Ready:      int(rs.Status.ReadyReplicas),
+		Updated:    int(rs.Status.FullyLabeledReplicas),
+	})
 
-	if replicas == 0 && rs.Status.Replicas == 0 {
-		return &HealthStatus{
-			Ready:  true,
-			Status: HealthStatusScaledToZero,
-			Health: health,
-		}, nil
-	}
-
-	if rs.Generation == rs.Status.ObservedGeneration &&
-		rs.Status.ReadyReplicas == *rs.Spec.Replicas {
-		return &HealthStatus{
-			Health: health,
-			Status: HealthStatusRunning,
-			Ready:  true,
-		}, nil
+	if rs.Generation != rs.Status.ObservedGeneration {
+		hr.Status = HealthStatusUpdating
+		hr.Ready = false
 	}
 
 	failCondition := getAppsv1ReplicaSetCondition(rs.Status, appsv1.ReplicaSetReplicaFailure)
-	if failCondition != nil && failCondition.Status == corev1.ConditionTrue {
-		return &HealthStatus{
-			Health:  health,
-			Status:  HealthStatusError,
-			Message: failCondition.Message,
-		}, nil
+	if hr.Health != HealthUnhealthy && failCondition != nil && failCondition.Status == corev1.ConditionTrue {
+		hr.Ready = true
+		hr.Health = HealthUnhealthy
+		hr.Message = failCondition.Message
 	}
-
-	if rs.Status.ReadyReplicas < *rs.Spec.Replicas {
-		return &HealthStatus{
-			Health:  health,
-			Status:  HealthStatusScalingUp,
-			Message: fmt.Sprintf("%d of %d pods ready", rs.Status.ReadyReplicas, *rs.Spec.Replicas),
-		}, nil
-	}
-
-	if rs.Status.ReadyReplicas > *rs.Spec.Replicas {
-		return &HealthStatus{
-			Health:  health,
-			Status:  HealthStatusScalingDown,
-			Message: fmt.Sprintf("%d pods terminating", rs.Status.ReadyReplicas-*rs.Spec.Replicas),
-		}, nil
-	}
-
-	return &HealthStatus{
-		Status: HealthStatusUnknown,
-		Health: health,
-	}, nil
+	return hr, nil
 }
 
 func getAppsv1ReplicaSetCondition(
diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go
index 798ab17..f72f547 100644
--- a/pkg/health/health_test.go
+++ b/pkg/health/health_test.go
@@ -545,11 +545,11 @@ func TestHPA(t *testing.T) {
 func TestReplicaSet(t *testing.T) {
 	assertAppHealthWithOverwrite(t, "./testdata/replicaset-ittools.yml", map[string]string{
 		"2024-08-03T06:06:18Z": time.Now().Add(-time.Minute * 2).UTC().Format("2006-01-02T15:04:05Z"),
-	}, health.HealthStatusRunning, health.HealthHealthy, true)
+	}, health.HealthStatusRunning, health.HealthHealthy, false)
 
 	assertAppHealthWithOverwrite(t, "./testdata/replicaset-unhealthy-pods.yaml", map[string]string{
 		"2024-10-21T11:20:19Z": time.Now().Add(-time.Minute * 2).UTC().Format("2006-01-02T15:04:05Z"),
-	}, health.HealthStatusScalingUp, health.HealthUnknown, false)
+	}, health.HealthStatusStarting, health.HealthUnknown, false)
 }
 
 func TestPod(t *testing.T) {
diff --git a/pkg/health/testdata/Kubernetes/ReplicaSet/unknown.yaml b/pkg/health/testdata/Kubernetes/ReplicaSet/unknown.yaml
new file mode 100644
index 0000000..154274a
--- /dev/null
+++ b/pkg/health/testdata/Kubernetes/ReplicaSet/unknown.yaml
@@ -0,0 +1,89 @@
+apiVersion: apps/v1
+kind: ReplicaSet
+metadata:
+  uid: 5164138b-dc4a-4192-9107-1e6341eafc29
+  name: incident-manager-ui-66cfd695c
+  labels:
+    pod-template-hash: 66cfd695c
+    app.kubernetes.io/name: incident-manager-ui
+    app.kubernetes.io/instance: mission-control
+  namespace: mission-control
+  annotations:
+    expected-status: Scaled to Zero
+    expected-ready: "true"
+    meta.helm.sh/release-name: mission-control
+    meta.helm.sh/release-namespace: mission-control
+    deployment.kubernetes.io/revision: "109"
+    deployment.kubernetes.io/max-replicas: "2"
+    deployment.kubernetes.io/desired-replicas: "1"
+  ownerReferences:
+    - uid: f40af5c4-d2d3-4478-8a97-a6125083dfcf
+      kind: Deployment
+      name: incident-manager-ui
+      apiVersion: apps/v1
+      controller: true
+      blockOwnerDeletion: true
+  creationTimestamp: 2024-11-11T19:03:49Z
+spec:
+  replicas: 0
+  selector:
+    matchLabels:
+      pod-template-hash: 66cfd695c
+      app.kubernetes.io/name: incident-manager-ui
+      app.kubernetes.io/instance: mission-control
+  template:
+    spec:
+      dnsPolicy: ClusterFirst
+      containers:
+        - env:
+            - name: HOSTNAME
+              value: 0.0.0.0
+            - name: ORY_KRATOS_URL
+              value: https://incident-commander.demo.aws.flanksource.com/api/.ory
+            - name: BACKEND_URL
+              value: http://mission-control:8080
+          name: flanksource-ui
+          image: public.ecr.aws/flanksource/incident-manager-ui:v1.0.822
+          ports:
+            - name: http
+              protocol: TCP
+              containerPort: 3000
+          resources:
+            limits:
+              memory: 2Gi
+            requests:
+              cpu: 200m
+              memory: 200Mi
+          livenessProbe:
+            httpGet:
+              path: /api/_health
+              port: http
+              scheme: HTTP
+            periodSeconds: 10
+            timeoutSeconds: 1
+            failureThreshold: 3
+            successThreshold: 1
+          readinessProbe:
+            httpGet:
+              path: /api/_health
+              port: http
+              scheme: HTTP
+            periodSeconds: 10
+            timeoutSeconds: 1
+            failureThreshold: 3
+            successThreshold: 1
+          imagePullPolicy: IfNotPresent
+          securityContext: {}
+          terminationMessagePath: /dev/termination-log
+          terminationMessagePolicy: File
+      restartPolicy: Always
+      schedulerName: default-scheduler
+      securityContext: {}
+      terminationGracePeriodSeconds: 30
+    metadata:
+      labels:
+        pod-template-hash: 66cfd695c
+        app.kubernetes.io/name: incident-manager-ui
+        app.kubernetes.io/instance: mission-control
+status:
+  replicas: 0