From 5939f38ca06b17215aa4e12692cec92be81184ef Mon Sep 17 00:00:00 2001 From: Moshe Immermam Date: Fri, 8 Nov 2024 12:27:22 +0200 Subject: [PATCH] chore: improve statefulset health --- pkg/health/health.go | 38 +++- pkg/health/health_pod.go | 2 +- pkg/health/health_statefulset.go | 59 +++--- pkg/health/health_test.go | 98 ++++++++- pkg/health/testdata/statefulset-starting.yaml | 197 ++++++++++++++++++ pkg/health/testdata/statefulset.yaml | 1 + pkg/health/utils.go | 36 ++++ pkg/lua/lua.go | 6 +- 8 files changed, 387 insertions(+), 50 deletions(-) create mode 100644 pkg/health/testdata/statefulset-starting.yaml diff --git a/pkg/health/health.go b/pkg/health/health.go index f4046ce..e7e1b5c 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -5,10 +5,14 @@ import ( "strings" "time" + "github.com/samber/lo" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/duration" ) +var DefaultOverrides HealthOverride + type Health string const ( @@ -40,6 +44,7 @@ const ( HealthStatusEvicted HealthStatusCode = "Evicted" HealthStatusCompleted HealthStatusCode = "Completed" HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff" + HealthStatusCrashed HealthStatusCode = "Crashed" HealthStatusCreating HealthStatusCode = "Creating" HealthStatusDeleted HealthStatusCode = "Deleted" HealthStatusDeleting HealthStatusCode = "Deleting" @@ -100,12 +105,37 @@ func IsWorse(current, new HealthStatusCode) bool { return newIndex > currentIndex } -func GetHealthByConfigType(configType string, obj map[string]any) HealthStatus { - if strings.HasPrefix(configType, "Mongo::") { +func GetHealthByConfigType(configType string, obj map[string]any, states ...string) HealthStatus { + switch configType { + case "AWS::ECS::Task": + return GetECSTaskHealth(obj) + } + + configClass := strings.Split(configType, "::")[0] + + switch strings.ToLower(configClass) { + case "mongo": return GetMongoHealth(obj) + case 
"kubernetes", "crossplane", "missioncontrol", "flux", "argo": + hr, err := + GetResourceHealth(&unstructured.Unstructured{Object: obj}, DefaultOverrides) + if hr != nil { + return *hr + } + if err != nil { + return HealthStatus{ + Status: "HealthParseError", + Message: lo.Elipse(err.Error(), 500), + } + } } - return HealthStatus{} + if len(states) > 0 { + return GetHealthFromStatusName(states[0]) + } + return HealthStatus{ + Health: HealthUnknown, + } } // GetResourceHealth returns the health of a k8s resource @@ -119,7 +149,7 @@ func GetResourceHealth( return &HealthStatus{ Status: "TerminatingStalled", Health: HealthUnhealthy, - Message: fmt.Sprintf("Resource is terminating, time since deletion: %v", terminatingFor), + Message: fmt.Sprintf("terminating for %v", duration.ShortHumanDuration(terminatingFor.Truncate(time.Hour))), }, nil } diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index db12959..148cf7e 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -35,7 +35,7 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { terminatingFor := time.Since(pod.ObjectMeta.DeletionTimestamp.Time) if terminatingFor >= time.Minute*15 { status = HealthWarning - message = fmt.Sprintf("stuck in 'Terminating' for %s", terminatingFor) + message = fmt.Sprintf("stuck in 'Terminating' for %s", terminatingFor.Truncate(time.Minute)) } return &HealthStatus{ diff --git a/pkg/health/health_statefulset.go b/pkg/health/health_statefulset.go index 54eeab2..3e1225c 100644 --- a/pkg/health/health_statefulset.go +++ b/pkg/health/health_statefulset.go @@ -2,12 +2,10 @@ package health import ( "fmt" - "strings" "time" appsv1 "k8s.io/api/apps/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" ) func getStatefulSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { @@ -15,9 +13,8 @@ func getStatefulSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) switch gvk { case 
appsv1.SchemeGroupVersion.WithKind(StatefulSetKind): var sts appsv1.StatefulSet - err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &sts) - if err != nil { - return nil, fmt.Errorf("failed to convert unstructured StatefulSet to typed: %v", err) + if err := convertFromUnstructured(obj, &sts); err != nil { + return nil, err } return getAppsv1StatefulSetHealth(&sts) default: @@ -35,61 +32,55 @@ func getAppsv1StatefulSetHealth(sts *appsv1.StatefulSet) (*HealthStatus, error) return &HealthStatus{ Status: HealthStatusScaledToZero, Health: HealthUnknown, + Ready: true, }, nil } - var containersWaitingForReadiness []string - for _, container := range sts.Spec.Template.Spec.Containers { - if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 { - deadline := sts.CreationTimestamp.Add( - time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds), - ) - if time.Now().Before(deadline) { - containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name) - } - } - } - - if len(containersWaitingForReadiness) > 0 { - return &HealthStatus{ - Health: HealthUnknown, - Status: HealthStatusStarting, - Message: fmt.Sprintf( - "Container(s) %s is waiting for readiness probe", - strings.Join(containersWaitingForReadiness, ","), - ), - }, nil - } + startDeadline := GetStartDeadline(sts.Spec.Template.Spec.Containers...) 
+ age := time.Since(sts.CreationTimestamp.Time).Truncate(time.Minute).Abs() health := HealthHealthy if sts.Status.ReadyReplicas == 0 { - health = HealthUnhealthy + if sts.Status.CurrentReplicas > 0 && age < startDeadline { + health = HealthUnknown + } else { + health = HealthUnhealthy + } } else if sts.Status.UpdatedReplicas == 0 { health = HealthWarning - } else if sts.Spec.Replicas != nil && sts.Status.ReadyReplicas >= *sts.Spec.Replicas { + } else if sts.Status.ReadyReplicas >= replicas { health = HealthHealthy } if sts.Spec.Replicas != nil && sts.Status.ReadyReplicas < *sts.Spec.Replicas { return &HealthStatus{ Health: health, - Status: HealthStatusRollingOut, + Status: HealthStatusStarting, Message: fmt.Sprintf("%d of %d pods ready", sts.Status.ReadyReplicas, *sts.Spec.Replicas), }, nil } - if sts.Spec.Replicas != nil && sts.Status.UpdatedReplicas < *sts.Spec.Replicas { + if sts.Spec.Replicas != nil && sts.Status.UpdatedReplicas < replicas { return &HealthStatus{ Health: health, Status: HealthStatusRollingOut, - Message: fmt.Sprintf("%d of %d pods updated", sts.Status.UpdatedReplicas, *sts.Spec.Replicas), + Message: fmt.Sprintf("%d of %d pods updated, %d of %d ready", sts.Status.UpdatedReplicas, replicas, sts.Status.ReadyReplicas, replicas), }, nil } if sts.Status.ObservedGeneration == 0 || sts.Generation > sts.Status.ObservedGeneration { return &HealthStatus{ - Health: health, - Status: HealthStatusRollingOut, + Health: health, + Status: HealthStatusRollingOut, + Message: fmt.Sprintf("generation not up to date %d", sts.Generation), + }, nil + } + + if sts.Status.UpdateRevision != "" && sts.Status.CurrentRevision != sts.Status.UpdateRevision { + return &HealthStatus{ + Health: health, + Status: HealthStatusRollingOut, + Message: fmt.Sprintf("revision not up to date %s", sts.Status.UpdateRevision), }, nil } diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index 55ee7b8..1c45582 100644 --- a/pkg/health/health_test.go +++ 
b/pkg/health/health_test.go @@ -6,18 +6,47 @@ package health_test import ( "os" + "sort" "strings" "testing" "time" "github.com/flanksource/is-healthy/pkg/health" - "github.com/flanksource/is-healthy/pkg/lua" + _ "github.com/flanksource/is-healthy/pkg/lua" + "github.com/samber/lo" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "sigs.k8s.io/yaml" + "k8s.io/apimachinery/pkg/util/yaml" ) +const RFC3339Micro = "2006-01-02T15:04:05Z" + +var _now = time.Now().UTC() +var defaultOverrides = map[string]string{ + + "@now": _now.Format(RFC3339Micro), + "@now-1m": _now.Add(-time.Minute * 1).Format(RFC3339Micro), + "@now-10m": _now.Add(-time.Minute * 5).Format(RFC3339Micro), + "@now-15m": _now.Add(-time.Minute * 15).Format(RFC3339Micro), + + "@now-5m": _now.Add(-time.Minute * 5).Format(RFC3339Micro), + "@now-1h": _now.Add(-time.Hour).Format(RFC3339Micro), + "@now-2h": _now.Add(-time.Hour * 2).Format(RFC3339Micro), + "@now-4h": _now.Add(-time.Hour * 4).Format(RFC3339Micro), + "@now-8h": _now.Add(-time.Hour * 8).Format(RFC3339Micro), + "@now-1d": _now.Add(-time.Hour * 24).Format(RFC3339Micro), + "@now+10m": _now.Add(time.Minute * 10).Format(RFC3339Micro), + "@now+5m": _now.Add(time.Minute * 5).Format(RFC3339Micro), + "@now+15m": _now.Add(time.Minute * 15).Format(RFC3339Micro), + + "@now+1h": _now.Add(time.Hour).Format(RFC3339Micro), + "@now+2h": _now.Add(time.Hour * 2).Format(RFC3339Micro), + "@now+4h": _now.Add(time.Hour * 4).Format(RFC3339Micro), + "@now+8h": _now.Add(time.Hour * 8).Format(RFC3339Micro), + "@now+1d": _now.Add(time.Hour * 24).Format(RFC3339Micro), +} + func assertAppHealthMsg( t *testing.T, yamlPath string, @@ -25,8 +54,20 @@ func assertAppHealthMsg( expectedHealth health.Health, expectedReady bool, expectedMsg string, + overrides ...string, ) { - health := getHealthStatus(yamlPath, t, nil) + m := make(map[string]string) + for k, v := range defaultOverrides { + m[k] = v + } + for i 
:= 0; i < len(overrides); i += 2 { + if v, ok := defaultOverrides[overrides[i+1]]; ok { + m[overrides[i]] = v + } else { + m[overrides[i]] = overrides[i+1] + } + } + health := getHealthStatus(yamlPath, t, m) assert.NotNil(t, health) assert.Equal(t, expectedHealth, health.Health) assert.Equal(t, expectedReady, health.Ready) @@ -40,8 +81,16 @@ func assertAppHealth( expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool, + overrides ...string, ) { - health := getHealthStatus(yamlPath, t, nil) + m := make(map[string]string) + for k, v := range defaultOverrides { + m[k] = v + } + for i := 0; i < len(overrides); i += 2 { + m[overrides[i]] = overrides[i+1] + } + health := getHealthStatus(yamlPath, t, m) assert.NotNil(t, health) assert.Equal(t, expectedHealth, health.Health) assert.Equal(t, expectedReady, health.Ready) @@ -81,19 +130,43 @@ func assertAppHealthWithOverwrite( } func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]string) *health.HealthStatus { + + if !strings.HasPrefix(yamlPath, "./testdata/") && !strings.HasPrefix(yamlPath, "../resource_customizations") { + yamlPath = "./testdata/" + yamlPath + } yamlBytes, err := os.ReadFile(yamlPath) require.NoError(t, err) - // Basic, search & replace overwrite - for k, v := range overwrites { - yamlBytes = []byte(strings.ReplaceAll(string(yamlBytes), k, v)) + yamlString := string(yamlBytes) + keys := lo.Keys(overwrites) + sort.Slice(keys, func(i, j int) bool { + return len(keys[i]) > len(keys[j]) + }) + + for _, k := range keys { + v := overwrites[k] + yamlString = strings.ReplaceAll(yamlString, k, v) + } + + //2nd iteration + for _, k := range keys { + v := overwrites[k] + yamlString = strings.ReplaceAll(yamlString, k, v) + } + + if strings.Contains(yamlPath, "::") { + configType := strings.Split(yamlPath, "/")[2] + var obj map[string]any + err = yaml.Unmarshal([]byte(yamlString), &obj) + require.NoError(t, err) + return 
lo.ToPtr(health.GetHealthByConfigType(configType, obj)) } var obj unstructured.Unstructured - err = yaml.Unmarshal(yamlBytes, &obj) + err = yaml.Unmarshal([]byte(yamlString), &obj) require.NoError(t, err) - health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{}) + health, err := health.GetResourceHealth(&obj, health.DefaultOverrides) require.NoError(t, err) return health } @@ -217,7 +290,12 @@ func TestDeploymentHealth(t *testing.T) { } func TestStatefulSetHealth(t *testing.T) { - assertAppHealth(t, "./testdata/statefulset.yaml", health.HealthStatusRollingOut, health.HealthWarning, false) + assertAppHealthMsg(t, "./testdata/statefulset.yaml", health.HealthStatusRunning, health.HealthHealthy, true, "") + assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnknown, false, "0 of 1 pods ready", "@now", "@now-1m") + assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnknown, false, "0 of 1 pods ready", "@now", "@now-5m") + assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnhealthy, false, "0 of 1 pods ready", "@now", "@now-15m") + assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnhealthy, false, "0 of 1 pods ready", "@now", "@now-1d") + } func TestStatefulSetOnDeleteHealth(t *testing.T) { diff --git a/pkg/health/testdata/statefulset-starting.yaml b/pkg/health/testdata/statefulset-starting.yaml new file mode 100644 index 0000000..50d853d --- /dev/null +++ b/pkg/health/testdata/statefulset-starting.yaml @@ -0,0 +1,197 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + uid: ff7c99d6-86a8-4cbc-a367-80b14ffc4a03 + name: postgresql + labels: + helm.sh/chart: postgresql-16.1.1 + app.kubernetes.io/name: postgresql + app.kubernetes.io/version: 17.0.0 + app.kubernetes.io/instance: postgresql + app.kubernetes.io/component: primary + 
helm.toolkit.fluxcd.io/name: postgresql + app.kubernetes.io/managed-by: Helm + helm.toolkit.fluxcd.io/namespace: default + namespace: default + annotations: + meta.helm.sh/release-name: postgresql + meta.helm.sh/release-namespace: default + creationTimestamp: "@now" +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: postgresql + app.kubernetes.io/component: primary + template: + spec: + volumes: + - name: empty-dir + emptyDir: {} + - name: dshm + emptyDir: + medium: Memory + - name: data + emptyDir: {} + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + podAffinityTerm: + topologyKey: kubernetes.io/hostname + labelSelector: + matchLabels: + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: postgresql + app.kubernetes.io/component: primary + dnsPolicy: ClusterFirst + containers: + - env: + - name: BITNAMI_DEBUG + value: "false" + - name: POSTGRESQL_PORT_NUMBER + value: "5432" + - name: POSTGRESQL_VOLUME_DIR + value: /bitnami/postgresql + - name: PGDATA + value: /bitnami/postgresql/data + - name: POSTGRES_USER + value: admin + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgresql + - name: POSTGRES_POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + key: postgres-password + name: postgresql + - name: POSTGRES_DATABASE + value: exampledb + - name: POSTGRESQL_ENABLE_LDAP + value: "no" + - name: POSTGRESQL_ENABLE_TLS + value: "no" + - name: POSTGRESQL_LOG_HOSTNAME + value: "false" + - name: POSTGRESQL_LOG_CONNECTIONS + value: "false" + - name: POSTGRESQL_LOG_DISCONNECTIONS + value: "false" + - name: POSTGRESQL_PGAUDIT_LOG_CATALOG + value: "off" + - name: POSTGRESQL_CLIENT_MIN_MESSAGES + value: error + - name: POSTGRESQL_SHARED_PRELOAD_LIBRARIES + value: pgaudit + name: postgresql + image: docker.io/bitnami/postgresql:17.0.0-debian-12-r9 + ports: + - name: tcp-postgresql + protocol: TCP + containerPort: 5432 + 
resources: + limits: + cpu: 150m + memory: 192Mi + ephemeral-storage: 2Gi + requests: + cpu: 100m + memory: 128Mi + ephemeral-storage: 50Mi + volumeMounts: + - name: empty-dir + subPath: tmp-dir + mountPath: /tmp + - name: empty-dir + subPath: app-conf-dir + mountPath: /opt/bitnami/postgresql/conf + - name: empty-dir + subPath: app-tmp-dir + mountPath: /opt/bitnami/postgresql/tmp + - name: dshm + mountPath: /dev/shm + - name: data + mountPath: /bitnami/postgresql + livenessProbe: + exec: + command: + - /bin/sh + - -c + - exec pg_isready -U "admin" -d "dbname=exampledb" -h 127.0.0.1 + -p 5432 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + initialDelaySeconds: 30 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - -e + - > + exec pg_isready -U "admin" -d "dbname=exampledb" -h 127.0.0.1 + -p 5432 + + [ -f /opt/bitnami/postgresql/tmp/.initialized ] || [ -f /bitnami/postgresql/.initialized ] + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + initialDelaySeconds: 5 + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 1001 + privileged: false + runAsGroup: 1001 + capabilities: + drop: + - ALL + runAsNonRoot: true + seLinuxOptions: {} + seccompProfile: + type: RuntimeDefault + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + restartPolicy: Always + schedulerName: default-scheduler + serviceAccount: postgresql + securityContext: + fsGroup: 1001 + fsGroupChangePolicy: Always + serviceAccountName: postgresql + automountServiceAccountToken: false + terminationGracePeriodSeconds: 30 + metadata: + name: postgresql + labels: + helm.sh/chart: postgresql-16.1.1 + app.kubernetes.io/name: postgresql + app.kubernetes.io/version: 17.0.0 + app.kubernetes.io/instance: postgresql + app.kubernetes.io/component: primary + app.kubernetes.io/managed-by: Helm + serviceName: postgresql-hl + updateStrategy: + 
type: RollingUpdate + rollingUpdate: + partition: 0 + podManagementPolicy: OrderedReady + revisionHistoryLimit: 10 + persistentVolumeClaimRetentionPolicy: + whenScaled: Retain + whenDeleted: Retain +status: + replicas: 1 + collisionCount: 0 + updateRevision: postgresql-57fcd45cfb + currentReplicas: 1 + currentRevision: postgresql-57fcd45cfb + updatedReplicas: 1 + availableReplicas: 0 diff --git a/pkg/health/testdata/statefulset.yaml b/pkg/health/testdata/statefulset.yaml index 3f05859..1ecd85e 100644 --- a/pkg/health/testdata/statefulset.yaml +++ b/pkg/health/testdata/statefulset.yaml @@ -117,5 +117,6 @@ status: currentRevision: redis-master-7b8f75b98 observedGeneration: 1 readyReplicas: 1 + updatedReplicas: 1 replicas: 1 updateRevision: redis-master-7b8f75b98 diff --git a/pkg/health/utils.go b/pkg/health/utils.go index d4f3edd..87174ce 100644 --- a/pkg/health/utils.go +++ b/pkg/health/utils.go @@ -4,8 +4,11 @@ import ( "fmt" "time" + "k8s.io/apimachinery/pkg/util/json" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) const ( @@ -122,3 +125,36 @@ func getPodConditionFromList( } return -1, nil } + +func convertFromUnstructured[T any](o *unstructured.Unstructured, to *T) error { + js, err := json.Marshal(o) + if err != nil { + return fmt.Errorf("failed to marshal object to JSON: %w", err) + } + + if err = json.Unmarshal(js, to); err != nil { + return fmt.Errorf("failed to unmarshal object into: %T: %v", *to, err) + } + return nil +} + +// duration after the creation of a replica set +// within which we never deem it to be unhealthy. 
+const PodStartingBufferPeriod = time.Minute * 10 + +func GetStartDeadline(containers ...corev1.Container) time.Duration { + max := PodStartingBufferPeriod + for _, container := range containers { + if readiness := container.ReadinessProbe; readiness != nil { + podLevel := time.Second * time.Duration(readiness.InitialDelaySeconds+readiness.FailureThreshold*(readiness.PeriodSeconds+readiness.TimeoutSeconds)) + if podLevel > max { + max = podLevel + } + } + } + return max.Truncate(time.Minute) +} + +func IsContainerStarting(creation time.Time, containers ...corev1.Container) bool { + return time.Since(creation) < GetStartDeadline(containers...) +} diff --git a/pkg/lua/lua.go b/pkg/lua/lua.go index ae908b8..1319e28 100644 --- a/pkg/lua/lua.go +++ b/pkg/lua/lua.go @@ -10,12 +10,12 @@ import ( "strings" "time" - "github.com/flanksource/is-healthy/pkg/health" lua "github.com/yuin/gopher-lua" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" luajson "layeh.com/gopher-json" + "github.com/flanksource/is-healthy/pkg/health" "github.com/flanksource/is-healthy/pkg/resource_customizations" ) @@ -30,6 +30,10 @@ const ( type ResourceHealthOverrides map[string]ResourceOverride +func init() { + health.DefaultOverrides = ResourceHealthOverrides{} +} + func (overrides ResourceHealthOverrides) GetResourceHealth( obj *unstructured.Unstructured, ) (*health.HealthStatus, error) {