Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: terminating stalled #94

Merged
merged 1 commit into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package health

import (
"fmt"
"strings"
"time"

"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down Expand Up @@ -108,6 +110,15 @@ func GetHealthByConfigType(configType string, obj map[string]any) HealthStatus {

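// GetResourceHealth returns the health of a Kubernetes resource.
// A resource whose deletion timestamp is more than an hour old is reported as "TerminatingStalled" (unhealthy) before any kind-specific health check runs.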
func GetResourceHealth(obj *unstructured.Unstructured, healthOverride HealthOverride) (health *HealthStatus, err error) {
if obj.GetDeletionTimestamp() != nil && !obj.GetDeletionTimestamp().IsZero() && time.Since(obj.GetDeletionTimestamp().Time) > time.Hour {
terminatingFor := time.Since(obj.GetDeletionTimestamp().Time)
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Message: fmt.Sprintf("Resource is terminating, time since deletion: %v", terminatingFor),
}, nil
}

if healthCheck := GetHealthCheckFunc(obj.GroupVersionKind()); healthCheck != nil {
if health, err = healthCheck(obj); err != nil {
health = &HealthStatus{
Expand Down
19 changes: 11 additions & 8 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,17 @@ func TestKustomization(t *testing.T) {
}

func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-terminating.yaml", map[string]string{
"2024-07-01T06:52:22Z": time.Now().Add(-time.Minute * 20).UTC().Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthWarning, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)

assertAppHealthWithOverwriteMsg(t, "./testdata/pod-not-ready-container-not-ready.yaml", map[string]string{
"2024-07-29T06:32:56Z": time.Now().Add(time.Minute * 10).Format(time.RFC3339),
}, health.HealthStatusStarting, health.HealthUnknown, false, "Container nginx is waiting for readiness probe")
Expand Down Expand Up @@ -201,10 +212,6 @@ func TestPod(t *testing.T) {

assertAppHealth(t, "./testdata/pod-old-restarts.yaml", health.HealthStatusRunning, health.HealthHealthy, true)

assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false)
status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil)
assert.Contains(t, status.Message, "stuck in 'Terminating' for")

assertAppHealth(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-running-not-ready.yaml", health.HealthStatusStarting, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-crashloop.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
Expand All @@ -216,10 +223,6 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)
}

// func TestAPIService(t *testing.T) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/health/testdata/terminating-namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: stuck-namespace
creationTimestamp: 2024-08-09T02:00:00Z
deletionTimestamp: 2024-08-09T03:00:05Z
128 changes: 128 additions & 0 deletions pkg/health/testdata/terminating-stuck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
apiVersion: v1
kind: Pod
metadata:
uid: fce251e7-c46f-4e3b-a9b6-05ad67de270c
name: alert-manager-qzc65
labels: {}
namespace: management
generateName: alert-manager-
ownerReferences:
- uid: 9939fcbb-8ffd-4c51-b01b-bff8092db71d
kind: Job
name: alert-manager
apiVersion: batch/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-08-09T02:00:00Z
deletionTimestamp: 2024-08-09T03:00:05Z
deletionGracePeriodSeconds: 0
spec:
volumes:
- name: alertmanager-storage
persistentVolumeClaim:
claimName: alertmanager-storage-alertmanager-0
- name: config
configMap:
name: alertmanager-configmap
defaultMode: 420
- name: kube-api-access-2fklr
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
hostname: alertmanager-0
nodeName: esr
priority: 0
dnsPolicy: ClusterFirst
subdomain: alertmanager
containers:
- name: alertmanager
image: quay.io/prometheus/alertmanager:v0.27.0
ports:
- name: http
protocol: TCP
containerPort: 9093
resources:
limits:
memory: 99M
requests:
cpu: 11m
memory: 50M
volumeMounts:
- name: alertmanager-storage
mountPath: /alertmanager
- name: config
mountPath: /etc/alertmanager
- name: kube-api-access-2fklr
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: IfNotPresent
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: default
securityContext: {}
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: false
serviceAccountName: default
automountServiceAccountToken: true
terminationGracePeriodSeconds: 30
status:
phase: Succeeded
podIP: 99.99.99.99
hostIP: 99.99.99.99
podIPs:
- ip: 99.99.99.99
qosClass: BestEffort
startTime: 2024-08-09T02:00:00Z
conditions:
- type: Initialized
reason: PodCompleted
status: "True"
- type: Ready
reason: PodCompleted
status: "False"
- type: ContainersReady
reason: PodCompleted
status: "False"
- type: PodScheduled
status: "True"
containerStatuses:
- name: aws-fargate-alert
image: flanksource.com/iiab-cronjobs:latest
ready: false
state:
terminated:
reason: Completed
exitCode: 0
startedAt: 2024-08-09T02:00:01Z
finishedAt: 2024-08-09T02:00:02Z
containerID: containerd://a878069fc3ae58b76423d3dffbb4fc57959bb2e68534ce1cb88e468478cfc55a
imageID: flanksource.com/iiab-cronjobs@sha256:9a560d72c176e0b77133f8df13acb3a3a761fbecbf5671c05a2d8b8b05450bc9
started: false
lastState: {}
containerID: containerd://a878069fc3ae58b76423d3dffbb4fc57959bb2e68534ce1cb88e468478cfc55a
restartCount: 0
Loading