Skip to content

Commit

Permalink
feat: terminating stalled
Browse files (browse the repository at this point in the history)
  • Loading branch information
adityathebe committed Aug 19, 2024
1 parent 3be957a commit 03a145e
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 8 deletions.
11 changes: 11 additions & 0 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package health

import (
"fmt"
"strings"
"time"

"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down Expand Up @@ -108,6 +110,15 @@ func GetHealthByConfigType(configType string, obj map[string]any) HealthStatus {

// GetResourceHealth returns the health of a k8s resource
func GetResourceHealth(obj *unstructured.Unstructured, healthOverride HealthOverride) (health *HealthStatus, err error) {
if obj.GetDeletionTimestamp() != nil && !obj.GetDeletionTimestamp().IsZero() && time.Since(obj.GetDeletionTimestamp().Time) > time.Hour {
terminatingFor := time.Since(obj.GetDeletionTimestamp().Time)
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Message: fmt.Sprintf("Resource is terminating, time since deletion: %v", terminatingFor),
}, nil
}

if healthCheck := GetHealthCheckFunc(obj.GroupVersionKind()); healthCheck != nil {
if health, err = healthCheck(obj); err != nil {
health = &HealthStatus{
Expand Down
19 changes: 11 additions & 8 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,17 @@ func TestKustomization(t *testing.T) {
}

func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-terminating.yaml", map[string]string{
"2024-07-01T06:52:22Z": time.Now().Add(-time.Minute * 20).UTC().Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthWarning, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)

assertAppHealthWithOverwriteMsg(t, "./testdata/pod-not-ready-container-not-ready.yaml", map[string]string{
"2024-07-29T06:32:56Z": time.Now().Add(time.Minute * 10).Format(time.RFC3339),
}, health.HealthStatusStarting, health.HealthUnknown, false, "Container nginx is waiting for readiness probe")
Expand Down Expand Up @@ -201,10 +212,6 @@ func TestPod(t *testing.T) {

assertAppHealth(t, "./testdata/pod-old-restarts.yaml", health.HealthStatusRunning, health.HealthHealthy, true)

assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false)
status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil)
assert.Contains(t, status.Message, "stuck in 'Terminating' for")

assertAppHealth(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-running-not-ready.yaml", health.HealthStatusStarting, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-crashloop.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
Expand All @@ -216,10 +223,6 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)
}

// func TestAPIService(t *testing.T) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/health/testdata/terminating-namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test fixture: a Namespace that has a deletionTimestamp but still exists.
# TestPod in health_test.go loads this file and expects the status
# "TerminatingStalled" with health.HealthUnhealthy.
apiVersion: v1
kind: Namespace
metadata:
name: stuck-namespace
creationTimestamp: 2024-08-09T02:00:00Z
# Fixed timestamp in the past. GetResourceHealth reports TerminatingStalled when
# more than one hour has elapsed since the deletion timestamp, so this value
# must stay a hard-coded old date rather than something close to "now".
deletionTimestamp: 2024-08-09T03:00:05Z
110 changes: 110 additions & 0 deletions pkg/health/testdata/terminating-stuck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Test fixture: a Job-owned Pod that has a deletionTimestamp but still exists.
# TestPod in health_test.go loads this file and expects the status
# "TerminatingStalled" with health.HealthUnhealthy.
apiVersion: v1
kind: Pod
metadata:
uid: fce251e7-c46f-4e3b-a9b6-05ad67de270c
name: aws-fargate-alert-28719480-qzc65
labels: {}
namespace: management
generateName: aws-fargate-alert-28719480-
ownerReferences:
- uid: 9939fcbb-8ffd-4c51-b01b-bff8092db71d
kind: Job
name: aws-fargate-alert-28719480
apiVersion: batch/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-08-09T02:00:00Z
# Fixed timestamp in the past. GetResourceHealth reports TerminatingStalled when
# more than one hour has elapsed since the deletion timestamp, so this value
# must stay a hard-coded old date rather than something close to "now".
deletionTimestamp: 2024-08-09T03:00:05Z
deletionGracePeriodSeconds: 0
spec:
volumes:
- name: kube-api-access-phfbf
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
nodeName: ip-168-1-68-139.eu-west-1.compute.internal
priority: 0
dnsPolicy: ClusterFirst
containers:
- name: aws-fargate-alert
image: docker.infoslipscloud.com/iiab-cronjobs
command:
- python
- /cronjobs/aws_fargate_alert.py
resources: {}
volumeMounts:
- name: kube-api-access-phfbf
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: Always
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Never
schedulerName: default-scheduler
serviceAccount: default
securityContext: {}
imagePullSecrets:
- name: docker-auth
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: true
serviceAccountName: default
terminationGracePeriodSeconds: 30
status:
# The pod itself finished cleanly (phase Succeeded, container exited with
# reason Completed and exitCode 0). The unhealthy result expected by the test
# therefore comes from the stale deletionTimestamp above, which
# GetResourceHealth checks before it runs any kind-specific health check.
phase: Succeeded
podIP: 168.1.92.238
hostIP: 168.1.68.139
podIPs:
- ip: 168.1.92.238
qosClass: BestEffort
startTime: 2024-08-09T02:00:00Z
conditions:
- type: Initialized
reason: PodCompleted
status: "True"
- type: Ready
reason: PodCompleted
status: "False"
- type: ContainersReady
reason: PodCompleted
status: "False"
- type: PodScheduled
status: "True"
containerStatuses:
- name: aws-fargate-alert
image: docker.infoslipscloud.com/iiab-cronjobs:latest
ready: false
state:
terminated:
reason: Completed
exitCode: 0
startedAt: 2024-08-09T02:00:01Z
finishedAt: 2024-08-09T02:00:02Z
containerID: containerd://a878069fc3ae58b76423d3dffbb4fc57959bb2e68534ce1cb88e468478cfc55a
imageID: docker.infoslipscloud.com/iiab-cronjobs@sha256:9a560d72c176e0b77133f8df13acb3a3a761fbecbf5671c05a2d8b8b05450bc9
started: false
lastState: {}
containerID: containerd://a878069fc3ae58b76423d3dffbb4fc57959bb2e68534ce1cb88e468478cfc55a
restartCount: 0

0 comments on commit 03a145e

Please sign in to comment.