diff --git a/docs/metrics.md b/docs/metrics.md index 1b938f269f865..891e8b3d23b09 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -293,6 +293,19 @@ You should only see this under high load. `recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds. +#### `pod_pending_count` + +A counter of pods that have been seen in the Pending state. + +| attribute | explanation | +|--------------------|-------------------------------------------| +| `reason` | Summary of the Kubernetes reason the pod is pending | +| `namespace` | The namespace in which the pod is running | + +Pods pending with the `PodInitializing` reason are ignored and not counted. +The `reason` attribute is the part of the pod's pending message that precedes the first `:`. +Pod state is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. + #### `pods_total_count` A gauge of the number of pods which have entered each phase and then observed by the controller. 
diff --git a/docs/upgrading.md b/docs/upgrading.md index a1b9395656dbf..3b4b4a3d010c1 100644 --- a/docs/upgrading.md +++ b/docs/upgrading.md @@ -28,6 +28,7 @@ The following are new metrics: * `is_leader` * `k8s_request_duration` * `pods_total_count` +* `pod_pending_count` * `queue_duration` * `queue_longest_running` * `queue_retries` diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 4f3b3e012fcf4..9cd62090faa73 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -4,10 +4,12 @@ package e2e import ( "testing" + "time" "github.com/gavv/httpexpect/v2" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1" @@ -109,6 +111,36 @@ func (s *MetricsSuite) TestFailedMetric() { }) } +func (s *MetricsSuite) TestPodPendingMetric() { + s.Given(). + Workflow(`@testdata/workflow-pending-metrics.yaml`). + When(). + SubmitWorkflow(). + WaitForPod(fixtures.PodCondition(func(p *corev1.Pod) bool { + if p.Status.Phase == corev1.PodPending { + for _, cond := range p.Status.Conditions { + if cond.Reason == corev1.PodReasonUnschedulable { + return true + } + } + } + return false + })). + Wait(2 * time.Second). // Hack: We may well observe the pod change faster than the controller + Then(). + ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) { + assert.Equal(t, wfv1.WorkflowRunning, status.Phase) + s.e(s.T()).GET(""). + Expect(). + Status(200). + Body(). + Contains(`pod_pending_count{namespace="argo",reason="Unschedulable"} 1`) + }). + When(). + DeleteWorkflow(). 
+ WaitForWorkflowDeletion() +} + func TestMetricsSuite(t *testing.T) { suite.Run(t, new(MetricsSuite)) } diff --git a/test/e2e/testdata/workflow-pending-metrics.yaml b/test/e2e/testdata/workflow-pending-metrics.yaml new file mode 100644 index 0000000000000..f38057e06e832 --- /dev/null +++ b/test/e2e/testdata/workflow-pending-metrics.yaml @@ -0,0 +1,22 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: workflow-pending-metrics- +spec: + entrypoint: main + nodeSelector: + arch: nonexistent + templates: + - name: main + steps: + - - name: runTest + template: run-test + - name: run-test + container: + name: runner + image: 'argoproj/argosay:v2' + args: + - exit 1 + command: + - sh + - -c diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index b01e8d0bc2d83..cbe9eed7fdc0d 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1360,6 +1360,9 @@ func (woc *wfOperationCtx) assessNodeStatus(ctx context.Context, pod *apiv1.Pod, new.Phase = wfv1.NodePending new.Message = getPendingReason(pod) new.Daemoned = nil + if old.Phase != new.Phase || old.Message != new.Message { + woc.controller.metrics.ChangePodPending(ctx, new.Message, pod.ObjectMeta.Namespace) + } case apiv1.PodSucceeded: new.Phase = wfv1.NodeSucceeded new.Daemoned = nil diff --git a/workflow/metrics/counter_pod_pending.go b/workflow/metrics/counter_pod_pending.go new file mode 100644 index 0000000000000..4c47fbb4a22f8 --- /dev/null +++ b/workflow/metrics/counter_pod_pending.go @@ -0,0 +1,36 @@ +package metrics + +import ( + "context" + "strings" +) + +const ( + namePodPending = `pod_pending_count` +) + +func addPodPendingCounter(_ context.Context, m *Metrics) error { + return m.createInstrument(int64Counter, + namePodPending, + "Total number of pods that started pending by reason", + "{pod}", + withAsBuiltIn(), + ) +} + +func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string) { + // Reason strings 
have a lot of stuff that would result in insane cardinality + // so we just take everything from before the first : + splitReason := strings.Split(reason, `:`) + switch splitReason[0] { + case "PodInitializing": + // Drop these, they are uninteresting and usually short + // the pod_phase metric can cope with this being visible + return + default: + m.addInt(ctx, namePodPending, 1, instAttribs{ + {name: labelPodPendingReason, value: splitReason[0]}, + {name: labelPodNamespace, value: namespace}, + }) + } +} diff --git a/workflow/metrics/labels.go b/workflow/metrics/labels.go index 9e33432d604fa..5094e2044a0a9 100644 --- a/workflow/metrics/labels.go +++ b/workflow/metrics/labels.go @@ -16,8 +16,9 @@ const ( labelNodePhase string = `node_phase` - labelPodPhase string = `phase` - labelPodNamespace string = `namespace` + labelPodPhase string = `phase` + labelPodNamespace string = `namespace` + labelPodPendingReason string = `reason` labelQueueName string = `queue_name` diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go index 5361bda8388e6..58ebbb8ab7bf8 100644 --- a/workflow/metrics/metrics.go +++ b/workflow/metrics/metrics.go @@ -97,6 +97,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call addPodPhaseGauge, addPodPhaseCounter, addPodMissingCounter, + addPodPendingCounter, addWorkflowPhaseGauge, addOperationDurationHistogram, addErrorCounter,