From e44a2b9948a6eea8caa712938ac9f539c6f0f63e Mon Sep 17 00:00:00 2001
From: Alan Clucas <alan@clucas.org>
Date: Thu, 27 Jun 2024 14:32:26 +0100
Subject: [PATCH] feat: new pod pending counter metric

The workflow controller is a kubernetes controller creating
pods. Sometimes those pods do not start, and will remain in pending.

This metric counts the number of pods that may have been observed as
pending, by namespace and truncated reason. The reason is the first
part of the kubernetes pod pending `Reason` up to the first `:` if
present.

It ignores all pods in the `PodInitializing` state as this I consider
unremarkable and temporary.

This is intended for users to create alerts on particular `reasons` or
if this metric is climbing unusually rapidly.

Note to reviewers: this is part of a stack of reviews for metrics
changes. Please don't merge until the rest of the stack is also ready.

Signed-off-by: Alan Clucas <alan@clucas.org>
---
 docs/metrics.md                               | 13 +++++++
 docs/upgrading.md                             |  1 +
 test/e2e/metrics_test.go                      | 31 ++++++++++++++++
 .../testdata/workflow-pending-metrics.yaml    | 22 ++++++++++++
 workflow/controller/operator.go               |  3 ++
 workflow/metrics/counter_pod_pending.go       | 36 +++++++++++++++++++
 workflow/metrics/labels.go                    |  5 +--
 workflow/metrics/metrics.go                   |  1 +
 8 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 test/e2e/testdata/workflow-pending-metrics.yaml
 create mode 100644 workflow/metrics/counter_pod_pending.go

diff --git a/docs/metrics.md b/docs/metrics.md
index 58f6d9e018c4..57558971ff61 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -302,6 +302,19 @@ You should only see this under high load.
 
 `recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds.
 
+#### `pod_pending_count`
+
+A counter of pods that have been seen in the Pending state.
+
+| attribute          | explanation                               |
+|--------------------|-------------------------------------------|
+| `reason` | Summary of the kubernetes Reason for pending.    |
+| `namespace`        | The namespace in which the pod is running |
+
+This metric ignores the `PodInitializing` reason and does not count it.
+The `reason` attribute is the value from the Reason message before the `:` in the message.
+This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed.
+
 #### `pods_total_count`
 
 A gauge of the number of pods which have entered each phase and then observed by the controller.
diff --git a/docs/upgrading.md b/docs/upgrading.md
index 76c629bdf7c4..c8af49ec1501 100644
--- a/docs/upgrading.md
+++ b/docs/upgrading.md
@@ -28,6 +28,7 @@ The following are new metrics:
 * `cronworkflows_triggered_total`
 * `is_leader`
 * `k8s_request_duration`
+* `pod_pending_count`
 * `pods_total_count`
 * `queue_duration`
 * `queue_longest_running`
diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go
index 295c728ef4be..929de4636cc9 100644
--- a/test/e2e/metrics_test.go
+++ b/test/e2e/metrics_test.go
@@ -9,6 +9,7 @@ import (
 	"github.com/gavv/httpexpect/v2"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/suite"
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
@@ -126,6 +127,36 @@ func (s *MetricsSuite) TestCronTriggeredCounter() {
 		})
 }
 
+func (s *MetricsSuite) TestPodPendingMetric() {
+	s.Given().
+		Workflow(`@testdata/workflow-pending-metrics.yaml`).
+		When().
+		SubmitWorkflow().
+		WaitForPod(fixtures.PodCondition(func(p *corev1.Pod) bool {
+			if p.Status.Phase == corev1.PodPending {
+				for _, cond := range p.Status.Conditions {
+					if cond.Reason == corev1.PodReasonUnschedulable {
+						return true
+					}
+				}
+			}
+			return false
+		})).
+		Wait(2 * time.Second). // Hack: We may well observe the pod change faster than the controller
+		Then().
+		ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
+			assert.Equal(t, wfv1.WorkflowRunning, status.Phase)
+			s.e(s.T()).GET("").
+				Expect().
+				Status(200).
+				Body().
+				Contains(`pod_pending_count{namespace="argo",reason="Unschedulable"} 1`)
+		}).
+		When().
+		DeleteWorkflow().
+		WaitForWorkflowDeletion()
+}
+
 func TestMetricsSuite(t *testing.T) {
 	suite.Run(t, new(MetricsSuite))
 }
diff --git a/test/e2e/testdata/workflow-pending-metrics.yaml b/test/e2e/testdata/workflow-pending-metrics.yaml
new file mode 100644
index 000000000000..f38057e06e83
--- /dev/null
+++ b/test/e2e/testdata/workflow-pending-metrics.yaml
@@ -0,0 +1,22 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: workflow-pending-metrics-
+spec:
+  entrypoint: main
+  nodeSelector:
+    arch: nonexistent
+  templates:
+    - name: main
+      steps:
+        - - name: runTest
+            template: run-test
+    - name: run-test
+      container:
+        name: runner
+        image: 'argoproj/argosay:v2'
+        args:
+          - exit 1
+        command:
+          - sh
+          - -c
diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go
index b01e8d0bc2d8..cbe9eed7fdc0 100644
--- a/workflow/controller/operator.go
+++ b/workflow/controller/operator.go
@@ -1360,6 +1360,9 @@ func (woc *wfOperationCtx) assessNodeStatus(ctx context.Context, pod *apiv1.Pod,
 		new.Phase = wfv1.NodePending
 		new.Message = getPendingReason(pod)
 		new.Daemoned = nil
+		if old.Phase != new.Phase || old.Message != new.Message {
+			woc.controller.metrics.ChangePodPending(ctx, new.Message, pod.ObjectMeta.Namespace)
+		}
 	case apiv1.PodSucceeded:
 		new.Phase = wfv1.NodeSucceeded
 		new.Daemoned = nil
diff --git a/workflow/metrics/counter_pod_pending.go b/workflow/metrics/counter_pod_pending.go
new file mode 100644
index 000000000000..4c47fbb4a22f
--- /dev/null
+++ b/workflow/metrics/counter_pod_pending.go
@@ -0,0 +1,36 @@
+package metrics
+
+import (
+	"context"
+	"strings"
+)
+
+const (
+	namePodPending = `pod_pending_count`
+)
+
+func addPodPendingCounter(_ context.Context, m *Metrics) error {
+	return m.createInstrument(int64Counter,
+		namePodPending,
+		"Total number of pods that started pending by reason",
+		"{pod}",
+		withAsBuiltIn(),
+	)
+}
+
+func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string) {
+	// Reason strings have a lot of stuff that would result in insane cardinatlity
+	// so we just take everything from before the first :
+	splitReason := strings.Split(reason, `:`)
+	switch splitReason[0] {
+	case "PodInitializing":
+		// Drop these, they are uninteresting and usually short
+		// the pod_phase metric can cope with this being visible
+		return
+	default:
+		m.addInt(ctx, namePodPending, 1, instAttribs{
+			{name: labelPodPendingReason, value: splitReason[0]},
+			{name: labelPodNamespace, value: namespace},
+		})
+	}
+}
diff --git a/workflow/metrics/labels.go b/workflow/metrics/labels.go
index 5270236823ba..9a29e692a764 100644
--- a/workflow/metrics/labels.go
+++ b/workflow/metrics/labels.go
@@ -18,8 +18,9 @@ const (
 
 	labelNodePhase string = `node_phase`
 
-	labelPodPhase     string = `phase`
-	labelPodNamespace string = `namespace`
+	labelPodPhase         string = `phase`
+	labelPodNamespace     string = `namespace`
+	labelPodPendingReason string = `reason`
 
 	labelQueueName string = `queue_name`
 
diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go
index a61cea655292..e586d13c355f 100644
--- a/workflow/metrics/metrics.go
+++ b/workflow/metrics/metrics.go
@@ -97,6 +97,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call
 		addPodPhaseGauge,
 		addPodPhaseCounter,
 		addPodMissingCounter,
+		addPodPendingCounter,
 		addWorkflowPhaseGauge,
 		addCronWfTriggerCounter,
 		addOperationDurationHistogram,