Skip to content

Commit

Permalink
fix: health fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
moshloop committed Nov 20, 2024
1 parent 1fa31fe commit b8744bd
Show file tree
Hide file tree
Showing 40 changed files with 1,206 additions and 1,487 deletions.
2 changes: 1 addition & 1 deletion pkg/health/health_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func getBatchv1JobHealth(job *batchv1.Job) (*HealthStatus, error) {
return &HealthStatus{
Ready: true,
Health: HealthUnhealthy,
Status: HealthStatusError,
Status: HealthStatusCode(condition.Reason),
Message: condition.Message,
}, nil
case batchv1.JobComplete:
Expand Down
58 changes: 28 additions & 30 deletions pkg/health/health_node.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
package health

import (
"fmt"

"github.com/samber/lo"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)
Expand All @@ -13,37 +12,36 @@ func getNodeHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
return nil, err
}

for _, taint := range node.Spec.Taints {
if taint.Key == "node.kubernetes.io/unschedulable" && taint.Effect == "NoSchedule" {
return &HealthStatus{
Ready: false,
Health: HealthWarning,
Status: "Unschedulable",
}, nil
}
hs := HealthStatus{
Status: HealthStatusCode(node.Status.Phase),
Health: HealthUnknown,
}

for _, cond := range node.Status.Conditions {
if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue {
return &HealthStatus{
Ready: true,
Health: HealthHealthy,
Status: HealthStatusHealthy,
}, nil
switch node.Status.Phase {
case v1.NodeRunning, "":
for _, cond := range node.Status.Conditions {
if cond.Type == v1.NodeReady {
if cond.Status == v1.ConditionTrue {
hs.Ready = true
hs.Status = lo.CoalesceOrEmpty(hs.Status, HealthStatusRunning)
hs.Health = hs.Health.Worst(HealthHealthy)
} else {
hs.Health = HealthUnhealthy
hs.Status = HealthStatusCode(HumanCase(string(cond.Type)))
hs.Message = cond.Message
}
} else if cond.Status == v1.ConditionTrue && cond.Type != "SysctlChanged" {
hs.Health = (HealthWarning)
hs.Status = HealthStatusCode(HumanCase(string(cond.Type)))
hs.Message = cond.Message
}
}

// All conditions apart from NodeReady should be false
if cond.Status == v1.ConditionTrue {
return &HealthStatus{
Status: HealthStatusDegraded,
Message: fmt.Sprintf("%s: %s", cond.Type, cond.Message),
}, nil
for _, taint := range node.Spec.Taints {
if taint.Key == "node.kubernetes.io/unschedulable" && taint.Effect == "NoSchedule" {
hs.Health = hs.Health.Worst(HealthWarning)
hs.Status = "Unschedulable"
}
}
}

return &HealthStatus{
Status: HealthStatusUnknown,
Health: HealthUnknown,
Message: "no conditions matched for node status",
}, nil
return &hs, nil
}
47 changes: 26 additions & 21 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,41 +34,46 @@ func getPodStatus(containers ...corev1.ContainerStatus) (waiting *HealthStatus,
} else if _waiting.Health.IsWorseThan(waiting.Health) {
waiting = _waiting
}
if _terminated != nil {
if terminated == nil {
terminated = _terminated
} else if _terminated.Health.IsWorseThan(terminated.Health) {
terminated = _terminated
}
}
if _terminated != nil {
if terminated == nil {
terminated = _terminated
} else if _terminated.Health.IsWorseThan(terminated.Health) {
terminated = _terminated
}
}
}
return waiting, terminated
}

// isErrorStatus reports whether the reason string s looks like an error
// reason: it begins with "Err", or it ends with "Error" or "BackOff".
// Matching is case-sensitive.
func isErrorStatus(s string) bool {
	switch {
	case strings.HasPrefix(s, "Err"):
		return true
	case strings.HasSuffix(s, "Error"):
		return true
	case strings.HasSuffix(s, "BackOff"):
		return true
	default:
		return false
	}
}

func getContainerStatus(containerStatus corev1.ContainerStatus) (waiting *HealthStatus, terminated *HealthStatus) {
if state := containerStatus.State.Waiting; state != nil &&
(strings.HasPrefix(state.Reason, "Err") ||
strings.HasSuffix(state.Reason, "Error") ||
strings.HasSuffix(state.Reason, "BackOff")) {
if state := containerStatus.State.Waiting; state != nil {
waiting = &HealthStatus{
Status: HealthStatusCode(state.Reason),
Health: HealthUnhealthy,
Health: lo.Ternary(isErrorStatus(state.Reason) || containerStatus.RestartCount > 0, HealthUnhealthy, HealthUnknown),
Message: state.Message,
}
}

if state := containerStatus.LastTerminationState.Terminated; state != nil {
age := time.Since(state.FinishedAt.Time)
terminated = &HealthStatus{
Status: HealthStatusCode(state.Reason),
Health: HealthUnhealthy,
Message: state.Message,
}
if age >= time.Hour*24 {
terminated.Health = HealthUnknown
} else if age >= time.Hour {
terminated.Health = HealthWarning
// ignore old terminate statuses
if age < time.Hour*24 {
terminated = &HealthStatus{
Status: HealthStatusCode(state.Reason),
Health: lo.Ternary(age < time.Hour, HealthUnhealthy, HealthWarning),
Message: state.Message,
}
if state.Reason == string(HealthStatusCompleted) && state.ExitCode == 0 {
// completed successfully
terminated.Health = HealthHealthy
}
}
}
return waiting, terminated
Expand All @@ -81,7 +86,7 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
age := time.Since(pod.CreationTimestamp.Time).Truncate(time.Minute).Abs()
isStarting := age < deadline
var hr = HealthStatus{
Health: HealthUnknown,
Health: lo.Ternary(isReady, HealthHealthy, HealthUnhealthy),
}

if pod.ObjectMeta.DeletionTimestamp != nil && !pod.ObjectMeta.DeletionTimestamp.IsZero() {
Expand Down
113 changes: 0 additions & 113 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,119 +565,6 @@ func TestReplicaSet(t *testing.T) {
}, health.HealthStatusStarting, health.HealthUnknown, false)
}

// TestPod runs the pod health evaluation against a series of fixture files
// under ./testdata and asserts the resulting status and health level.
//
// NOTE(review): the assert helpers are defined outside this view; the
// arguments appear to be (t, fixture path, [overwrite map], expected status,
// expected health, expected ready flag, [expected message]) — confirm against
// the helper definitions. The overwrite maps presumably substitute the key
// string found in the fixture with the value before evaluation.
func TestPod(t *testing.T) {
// Pods/namespaces stuck in termination are expected to report TerminatingStalled as a warning.
assertAppHealthMsg(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthWarning, false)
assertAppHealthMsg(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthWarning, false)

// Timestamp replaced with "20 minutes ago": expect Terminating with a warning.
assertAppHealthWithOverwrite(t, "./testdata/pod-terminating.yaml", map[string]string{
"2024-07-01T06:52:22Z": time.Now().Add(-time.Minute * 20).UTC().Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthWarning, false)

// Timestamp replaced with "1 minute ago": expect Terminating with unknown health.
assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)

// Timestamp replaced with a time 10 minutes in the future: expect Starting
// with the readiness-probe message.
assertAppHealthWithOverwriteMsg(t, "./testdata/pod-not-ready-container-not-ready.yaml", map[string]string{
"2024-07-29T06:32:56Z": time.Now().Add(time.Minute * 10).Format(time.RFC3339),
}, health.HealthStatusStarting, health.HealthUnknown, false, "Container nginx is waiting for readiness probe")

// Pod not ready although its container is ready: expect Running with a warning.
assertAppHealthMsg(
t,
"./testdata/pod-not-ready-but-container-ready.yaml",
health.HealthStatusRunning,
health.HealthWarning,
false,
)

// Ready pod whose container has a terminated state recorded in the fixture:
// with the fixture's original timestamps it is expected to be Running and healthy.
assertAppHealthMsg(
t,
"./testdata/pod-ready-container-terminated.yaml",
health.HealthStatusRunning,
health.HealthHealthy,
true,
)

// Same fixture with the timestamp moved to 50 minutes ago: expect a warning.
assertAppHealthWithOverwrite(t, "./testdata/pod-ready-container-terminated.yaml", map[string]string{
"2024-07-18T12:03:06Z": time.Now().
Add(-time.Minute * 50).
UTC().
Format("2006-01-02T15:04:05Z"),
// the key above is presumably the container's last-terminated time in the fixture
}, health.HealthStatusRunning, health.HealthWarning, false)

// Timestamps moved to 1 minute ago: expect OOMKilled and unhealthy.
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
"2024-07-17T14:29:52Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), // start time
}, "OOMKilled", health.HealthUnhealthy, false)

// Timestamps moved to 1 hour ago: expect OOMKilled downgraded to a warning.
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"),
"2024-07-17T14:29:52Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"), // start time
}, "OOMKilled", health.HealthWarning, false)

// Timestamp replaced with a fixed date (2024-06-17), i.e. an old termination:
// expect the pod to be reported as Running and healthy.
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": "2024-06-17T14:29:51Z",
}, health.HealthStatusRunning, health.HealthHealthy, true)

// Fixture with old restarts, evaluated with its original timestamps.
assertAppHealthMsg(t, "./testdata/pod-old-restarts.yaml", health.HealthStatusRunning, health.HealthHealthy, true)

// Remaining fixtures are evaluated as-is, one expected status/health pair per file.
assertAppHealthMsg(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false)
assertAppHealthMsg(
t,
"./testdata/pod-running-not-ready.yaml",
health.HealthStatusStarting,
health.HealthUnknown,
false,
)
assertAppHealthMsg(
t,
"./testdata/pod-crashloop.yaml",
health.HealthStatusCrashLoopBackoff,
health.HealthUnhealthy,
false,
)
assertAppHealthMsg(
t,
"./testdata/pod-crashloop-pending.yaml",
health.HealthStatusCrashLoopBackoff,
health.HealthUnhealthy,
false,
)
assertAppHealthMsg(
t,
"./testdata/pod-running-restart-always.yaml",
health.HealthStatusRunning,
health.HealthHealthy,
true,
)
assertAppHealthMsg(
t,
"./testdata/pod-running-restart-never.yaml",
health.HealthStatusRunning,
health.HealthHealthy,
false,
)
assertAppHealthMsg(
t,
"./testdata/pod-running-restart-onfailure.yaml",
health.HealthStatusRunning,
health.HealthUnhealthy,
false,
)
// A failing init container is expected to surface as CrashLoopBackoff.
assertAppHealthMsg(
t,
"./testdata/pod-init-container-fail.yaml",
health.HealthStatusCrashLoopBackoff,
health.HealthUnhealthy,
false,
)
}

// func TestAPIService(t *testing.T) {
// assertAppHealthMsg(t, "./testdata/apiservice-v1-true.yaml", HealthStatusHealthy, health.HealthHealthy, true)
// assertAppHealthMsg(t, "./testdata/apiservice-v1-false.yaml", HealthStatusProgressing, health.HealthHealthy, true)
Expand Down
30 changes: 30 additions & 0 deletions pkg/health/testdata/Kubernetes/Alert/healthy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
apiVersion: notification.toolkit.fluxcd.io/v1beta2
kind: Alert
metadata:
uid: e9a7544d-5407-4ce1-9f01-31ba6a9bf4b0
name: hub-cluster
labels:
kustomize.toolkit.fluxcd.io/name: sre-infra-hub-entry
kustomize.toolkit.fluxcd.io/namespace: flux
namespace: flux-system
finalizers:
- finalizers.fluxcd.io
creationTimestamp: 2022-12-01T11:44:15Z
spec:
summary: bcb group hub cluster
providerRef:
name: slack
eventSources:
- kind: GitRepository
name: "*"
- kind: Kustomization
name: "*"
- kind: HelmRelease
name: "*"
eventSeverity: info
status:
conditions:
- type: Ready
reason: Succeeded
status: "True"
message: Initialized
65 changes: 65 additions & 0 deletions pkg/health/testdata/Kubernetes/Job/unhealthy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
apiVersion: batch/v1
kind: Job
metadata:
uid: 6b1db17e-f7ff-4141-a435-b4fd96296505
name: always-failing-28868400
labels: {}
namespace: canaries
annotations:
batch.kubernetes.io/cronjob-scheduled-timestamp: 2024-11-20T12:00:00Z
expected-status: BackoffLimitExceeded
ownerReferences:
- uid: d0ae28d6-8ea2-49c9-8d37-f0147d7afa5f
kind: CronJob
name: always-failing
apiVersion: batch/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-11-20T12:00:00Z
spec:
suspend: false
selector:
matchLabels:
batch.kubernetes.io/controller-uid: 6b1db17e-f7ff-4141-a435-b4fd96296505
template:
spec:
dnsPolicy: ClusterFirst
containers:
- name: fail
image: busybox:1.28
command:
- /bin/sh
- -c
- exit 1
resources: {}
imagePullPolicy: IfNotPresent
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
restartPolicy: OnFailure
schedulerName: default-scheduler
securityContext: {}
terminationGracePeriodSeconds: 30
metadata:
labels:
job-name: always-failing-28868400
controller-uid: 6b1db17e-f7ff-4141-a435-b4fd96296505
batch.kubernetes.io/job-name: always-failing-28868400
batch.kubernetes.io/controller-uid: 6b1db17e-f7ff-4141-a435-b4fd96296505
completions: 1
parallelism: 1
backoffLimit: 1
completionMode: NonIndexed
manualSelector: false
podReplacementPolicy: TerminatingOrFailed
status:
ready: 0
failed: 1
startTime: 2024-11-20T12:00:00Z
conditions:
- type: Failed
reason: BackoffLimitExceeded
status: "True"
message: Job has reached the specified backoff limit
lastProbeTime: 2024-11-20T12:00:02Z
terminating: 0
uncountedTerminatedPods: {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: v1
kind: Namespace
metadata:
name: stuck-namespace
annotations:
expected-status: TerminatingStalled
expected-health: warning
expected-message: terminating for 1h
creationTimestamp: 2024-08-09T02:00:00Z
deletionTimestamp: "@now-1h"
Loading

0 comments on commit b8744bd

Please sign in to comment.