Skip to content

Commit

Permalink
chore: improve statefulset health
Browse files Browse the repository at this point in the history
  • Loading branch information
moshloop committed Nov 8, 2024
1 parent be8d928 commit 5939f38
Show file tree
Hide file tree
Showing 8 changed files with 387 additions and 50 deletions.
38 changes: 34 additions & 4 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ import (
"strings"
"time"

"github.com/samber/lo"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/duration"
)

var DefaultOverrides HealthOverride

type Health string

const (
Expand Down Expand Up @@ -40,6 +44,7 @@ const (
HealthStatusEvicted HealthStatusCode = "Evicted"
HealthStatusCompleted HealthStatusCode = "Completed"
HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff"
HealthStatusCrashed HealthStatusCode = "Crashed"
HealthStatusCreating HealthStatusCode = "Creating"
HealthStatusDeleted HealthStatusCode = "Deleted"
HealthStatusDeleting HealthStatusCode = "Deleting"
Expand Down Expand Up @@ -100,12 +105,37 @@ func IsWorse(current, new HealthStatusCode) bool {
return newIndex > currentIndex
}

func GetHealthByConfigType(configType string, obj map[string]any) HealthStatus {
if strings.HasPrefix(configType, "Mongo::") {
func GetHealthByConfigType(configType string, obj map[string]any, states ...string) HealthStatus {
switch configType {
case "AWS::ECS::Task":
return GetECSTaskHealth(obj)
}

configClass := strings.Split(configType, "::")[0]

switch strings.ToLower(configClass) {
case "mongo":
return GetMongoHealth(obj)
case "kubernetes", "crossplane", "missioncontrol", "flux", "argo":
hr, err :=
GetResourceHealth(&unstructured.Unstructured{Object: obj}, DefaultOverrides)
if hr != nil {
return *hr
}
if err != nil {
return HealthStatus{
Status: "HealthParseError",
Message: lo.Elipse(err.Error(), 500),
}
}
}

return HealthStatus{}
if len(states) > 0 {
return GetHealthFromStatusName(states[0])
}
return HealthStatus{
Health: HealthUnknown,
}
}

// GetResourceHealth returns the health of a k8s resource
Expand All @@ -119,7 +149,7 @@ func GetResourceHealth(
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Message: fmt.Sprintf("Resource is terminating, time since deletion: %v", terminatingFor),
Message: fmt.Sprintf("terminating for %v", duration.ShortHumanDuration(terminatingFor.Truncate(time.Hour))),
}, nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
terminatingFor := time.Since(pod.ObjectMeta.DeletionTimestamp.Time)
if terminatingFor >= time.Minute*15 {
status = HealthWarning
message = fmt.Sprintf("stuck in 'Terminating' for %s", terminatingFor)
message = fmt.Sprintf("stuck in 'Terminating' for %s", terminatingFor.Truncate(time.Minute))
}

return &HealthStatus{
Expand Down
59 changes: 25 additions & 34 deletions pkg/health/health_statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,19 @@ package health

import (
"fmt"
"strings"
"time"

appsv1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
)

func getStatefulSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
gvk := obj.GroupVersionKind()
switch gvk {
case appsv1.SchemeGroupVersion.WithKind(StatefulSetKind):
var sts appsv1.StatefulSet
err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &sts)
if err != nil {
return nil, fmt.Errorf("failed to convert unstructured StatefulSet to typed: %v", err)
if err := convertFromUnstructured(obj, &sts); err != nil {
return nil, err
}
return getAppsv1StatefulSetHealth(&sts)
default:
Expand All @@ -35,61 +32,55 @@ func getAppsv1StatefulSetHealth(sts *appsv1.StatefulSet) (*HealthStatus, error)
return &HealthStatus{
Status: HealthStatusScaledToZero,
Health: HealthUnknown,
Ready: true,
}, nil
}

var containersWaitingForReadiness []string
for _, container := range sts.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
deadline := sts.CreationTimestamp.Add(
time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds),
)
if time.Now().Before(deadline) {
containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name)
}
}
}

if len(containersWaitingForReadiness) > 0 {
return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusStarting,
Message: fmt.Sprintf(
"Container(s) %s is waiting for readiness probe",
strings.Join(containersWaitingForReadiness, ","),
),
}, nil
}
startDeadline := GetStartDeadline(sts.Spec.Template.Spec.Containers...)
age := time.Since(sts.CreationTimestamp.Time).Truncate(time.Minute).Abs()

health := HealthHealthy
if sts.Status.ReadyReplicas == 0 {
health = HealthUnhealthy
if sts.Status.CurrentReplicas > 0 && age < startDeadline {
health = HealthUnknown
} else {
health = HealthUnhealthy
}
} else if sts.Status.UpdatedReplicas == 0 {
health = HealthWarning
} else if sts.Spec.Replicas != nil && sts.Status.ReadyReplicas >= *sts.Spec.Replicas {
} else if sts.Status.ReadyReplicas >= replicas {
health = HealthHealthy
}

if sts.Spec.Replicas != nil && sts.Status.ReadyReplicas < *sts.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusRollingOut,
Status: HealthStatusStarting,
Message: fmt.Sprintf("%d of %d pods ready", sts.Status.ReadyReplicas, *sts.Spec.Replicas),
}, nil
}

if sts.Spec.Replicas != nil && sts.Status.UpdatedReplicas < *sts.Spec.Replicas {
if sts.Spec.Replicas != nil && sts.Status.UpdatedReplicas < replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusRollingOut,
Message: fmt.Sprintf("%d of %d pods updated", sts.Status.UpdatedReplicas, *sts.Spec.Replicas),
Message: fmt.Sprintf("%d of %d pods updated, %d of %d ready", sts.Status.UpdatedReplicas, replicas, sts.Status.ReadyReplicas, replicas),
}, nil
}

if sts.Status.ObservedGeneration == 0 || sts.Generation > sts.Status.ObservedGeneration {
return &HealthStatus{
Health: health,
Status: HealthStatusRollingOut,
Health: health,
Status: HealthStatusRollingOut,
Message: fmt.Sprintf("generation not up to date %d", sts.Generation),
}, nil
}

if sts.Status.UpdateRevision != "" && sts.Status.CurrentRevision != sts.Status.UpdateRevision {
return &HealthStatus{
Health: health,
Status: HealthStatusRollingOut,
Message: fmt.Sprintf("revision not up to date %s", sts.Status.UpdateRevision),
}, nil
}

Expand Down
98 changes: 88 additions & 10 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,68 @@ package health_test

import (
"os"
"sort"
"strings"
"testing"
"time"

"github.com/flanksource/is-healthy/pkg/health"
"github.com/flanksource/is-healthy/pkg/lua"
_ "github.com/flanksource/is-healthy/pkg/lua"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"sigs.k8s.io/yaml"
"k8s.io/apimachinery/pkg/util/yaml"
)

// RFC3339Micro is the timestamp layout used when rendering the "@now..."
// placeholders below. Despite its name the layout carries no fractional
// seconds: values are formatted with whole-second precision and a literal
// trailing "Z", so times must already be in UTC when formatted.
const RFC3339Micro = "2006-01-02T15:04:05Z"

// _now is captured once at package initialisation so that every placeholder
// in defaultOverrides is computed relative to the same instant.
var _now = time.Now().UTC()

// defaultOverrides maps "@now<offset>" placeholders to timestamps formatted
// with RFC3339Micro, each offset from _now by the amount encoded in its key
// (m = minutes, h = hours, d = 24 hours).
var defaultOverrides = map[string]string{
	// The reference instant and offsets into the past.
	"@now":     _now.Format(RFC3339Micro),
	"@now-1m":  _now.Add(-time.Minute).Format(RFC3339Micro),
	"@now-5m":  _now.Add(-time.Minute * 5).Format(RFC3339Micro),
	"@now-10m": _now.Add(-time.Minute * 10).Format(RFC3339Micro), // was 5 minutes, duplicating "@now-5m"
	"@now-15m": _now.Add(-time.Minute * 15).Format(RFC3339Micro),
	"@now-1h":  _now.Add(-time.Hour).Format(RFC3339Micro),
	"@now-2h":  _now.Add(-time.Hour * 2).Format(RFC3339Micro),
	"@now-4h":  _now.Add(-time.Hour * 4).Format(RFC3339Micro),
	"@now-8h":  _now.Add(-time.Hour * 8).Format(RFC3339Micro),
	"@now-1d":  _now.Add(-time.Hour * 24).Format(RFC3339Micro),

	// Offsets into the future.
	"@now+5m":  _now.Add(time.Minute * 5).Format(RFC3339Micro),
	"@now+10m": _now.Add(time.Minute * 10).Format(RFC3339Micro),
	"@now+15m": _now.Add(time.Minute * 15).Format(RFC3339Micro),
	"@now+1h":  _now.Add(time.Hour).Format(RFC3339Micro),
	"@now+2h":  _now.Add(time.Hour * 2).Format(RFC3339Micro),
	"@now+4h":  _now.Add(time.Hour * 4).Format(RFC3339Micro),
	"@now+8h":  _now.Add(time.Hour * 8).Format(RFC3339Micro),
	"@now+1d":  _now.Add(time.Hour * 24).Format(RFC3339Micro),
}

func assertAppHealthMsg(
t *testing.T,
yamlPath string,
expectedStatus health.HealthStatusCode,
expectedHealth health.Health,
expectedReady bool,
expectedMsg string,
overrides ...string,
) {
health := getHealthStatus(yamlPath, t, nil)
m := make(map[string]string)
for k, v := range defaultOverrides {
m[k] = v
}
for i := 0; i < len(overrides); i += 2 {
if v, ok := defaultOverrides[overrides[i+1]]; ok {
m[overrides[i]] = v
} else {
m[overrides[i]] = overrides[i+1]
}
}
health := getHealthStatus(yamlPath, t, m)
assert.NotNil(t, health)
assert.Equal(t, expectedHealth, health.Health)
assert.Equal(t, expectedReady, health.Ready)
Expand All @@ -40,8 +81,16 @@ func assertAppHealth(
expectedStatus health.HealthStatusCode,
expectedHealth health.Health,
expectedReady bool,
overrides ...string,
) {
health := getHealthStatus(yamlPath, t, nil)
m := make(map[string]string)
for k, v := range defaultOverrides {
m[k] = v
}
for i := 0; i < len(overrides); i += 2 {
m[overrides[i]] = overrides[i+1]
}
health := getHealthStatus(yamlPath, t, m)
assert.NotNil(t, health)
assert.Equal(t, expectedHealth, health.Health)
assert.Equal(t, expectedReady, health.Ready)
Expand Down Expand Up @@ -81,19 +130,43 @@ func assertAppHealthWithOverwrite(
}

func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]string) *health.HealthStatus {

if !strings.HasPrefix(yamlPath, "./testdata/") && !strings.HasPrefix(yamlPath, "../resource_customizations") {
yamlPath = "./testdata/" + yamlPath
}
yamlBytes, err := os.ReadFile(yamlPath)
require.NoError(t, err)

// Basic, search & replace overwrite
for k, v := range overwrites {
yamlBytes = []byte(strings.ReplaceAll(string(yamlBytes), k, v))
yamlString := string(yamlBytes)
keys := lo.Keys(overwrites)
sort.Slice(keys, func(i, j int) bool {
return len(keys[i]) > len(keys[j])
})

for _, k := range keys {
v := overwrites[k]
yamlString = strings.ReplaceAll(yamlString, k, v)
}

//2nd iteration
for _, k := range keys {
v := overwrites[k]
yamlString = strings.ReplaceAll(yamlString, k, v)
}

if strings.Contains(yamlPath, "::") {
configType := strings.Split(yamlPath, "/")[2]
var obj map[string]any
err = yaml.Unmarshal([]byte(yamlString), &obj)
require.NoError(t, err)
return lo.ToPtr(health.GetHealthByConfigType(configType, obj))
}

var obj unstructured.Unstructured
err = yaml.Unmarshal(yamlBytes, &obj)
err = yaml.Unmarshal([]byte(yamlString), &obj)
require.NoError(t, err)

health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{})
health, err := health.GetResourceHealth(&obj, health.DefaultOverrides)
require.NoError(t, err)
return health
}
Expand Down Expand Up @@ -217,7 +290,12 @@ func TestDeploymentHealth(t *testing.T) {
}

// TestStatefulSetHealth checks the status, health, readiness and message
// reported for the StatefulSet fixtures under ./testdata.
//
// The statefulset-starting.yaml cases pass an ("@now", "@now-<age>") override
// pair, which substitutes the "@now" placeholder in the fixture with a
// timestamp that far in the past. The expected health is Unknown for the 1m
// and 5m cases and Unhealthy for the 15m and 1d cases, while the status stays
// Starting throughout.
// NOTE(review): presumably "@now" is the fixture's creationTimestamp — confirm
// against testdata/statefulset-starting.yaml.
func TestStatefulSetHealth(t *testing.T) {
assertAppHealth(t, "./testdata/statefulset.yaml", health.HealthStatusRollingOut, health.HealthWarning, false)
assertAppHealthMsg(t, "./testdata/statefulset.yaml", health.HealthStatusRunning, health.HealthHealthy, true, "")
assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnknown, false, "0 of 1 pods ready", "@now", "@now-1m")
assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnknown, false, "0 of 1 pods ready", "@now", "@now-5m")
assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnhealthy, false, "0 of 1 pods ready", "@now", "@now-15m")
assertAppHealthMsg(t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnhealthy, false, "0 of 1 pods ready", "@now", "@now-1d")

}

func TestStatefulSetOnDeleteHealth(t *testing.T) {
Expand Down
Loading

0 comments on commit 5939f38

Please sign in to comment.