Skip to content

Commit

Permalink
feat: new pod pending counter metric
Browse files Browse the repository at this point in the history
The workflow controller is a kubernetes controller creating
pods. Sometimes those pods do not start, and will remain in pending.

This metric counts the number of pods that may have been observed as
pending, by namespace and truncated reason. The reason is the first
part of the kubernetes pod pending `Reason` up to the first `:` if
present.

It ignores all pods in the `PodInitializing` state, as I consider this
unremarkable and temporary.

This is intended for users to create alerts on particular `reasons` or
if this metric is climbing unusually rapidly.

Note to reviewers: this is part of a stack of reviews for metrics
changes. Please don't merge until the rest of the stack is also ready.

Signed-off-by: Alan Clucas <[email protected]>
  • Loading branch information
Joibel committed Aug 15, 2024
1 parent b09c0d2 commit 69dfcca
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 2 deletions.
13 changes: 13 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,19 @@ You should only see this under high load.

`recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds.

#### `pod_pending_count`

A counter of pods that have been seen in the Pending state.

| attribute | explanation |
|--------------------|-------------------------------------------|
| `reason` | Summary of the kubernetes Reason for pending. |
| `namespace` | The namespace in which the pod is running |

This metric ignores the `PodInitializing` reason and does not count it.
The `reason` attribute is the part of the Kubernetes pending Reason message before the first `:`, or the whole message if it contains no `:`.
This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed.

#### `pods_total_count`

A gauge of the number of pods which have entered each phase and then been observed by the controller.
Expand Down
1 change: 1 addition & 0 deletions docs/upgrading.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The following are new metrics:
* `is_leader`
* `k8s_request_duration`
* `pods_total_count`
* `pod_pending_count`
* `queue_duration`
* `queue_longest_running`
* `queue_retries`
Expand Down
32 changes: 32 additions & 0 deletions test/e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ package e2e

import (
"testing"
"time"

"github.com/gavv/httpexpect/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
Expand Down Expand Up @@ -109,6 +111,36 @@ func (s *MetricsSuite) TestFailedMetric() {
})
}

// TestPodPendingMetric submits the workflow-pending-metrics fixture, waits for
// its pod to be Pending with an Unschedulable condition, and then checks that
// the body returned by GET on s.e contains a pod_pending_count sample of 1 for
// namespace "argo" and reason "Unschedulable". The workflow is deleted afterwards.
func (s *MetricsSuite) TestPodPendingMetric() {
	// Reports true once the pod is Pending and at least one of its
	// conditions carries the Unschedulable reason.
	unschedulable := func(pod *corev1.Pod) bool {
		if pod.Status.Phase != corev1.PodPending {
			return false
		}
		for i := range pod.Status.Conditions {
			if pod.Status.Conditions[i].Reason == corev1.PodReasonUnschedulable {
				return true
			}
		}
		return false
	}

	// The workflow must still be running, and the counter must show one pod.
	checkMetric := func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
		assert.Equal(t, wfv1.WorkflowRunning, status.Phase)
		s.e(s.T()).GET("").
			Expect().
			Status(200).
			Body().
			Contains(`pod_pending_count{namespace="argo",reason="Unschedulable"} 1`)
	}

	s.Given().
		Workflow(`@testdata/workflow-pending-metrics.yaml`).
		When().
		SubmitWorkflow().
		WaitForPod(fixtures.PodCondition(unschedulable)).
		Wait(2 * time.Second). // Hack: We may well observe the pod change faster than the controller
		Then().
		ExpectWorkflow(checkMetric).
		When().
		DeleteWorkflow().
		WaitForWorkflowDeletion()
}

// TestMetricsSuite is the go test entry point that runs every test method
// defined on MetricsSuite.
func TestMetricsSuite(t *testing.T) {
	suite.Run(t, &MetricsSuite{})
}
22 changes: 22 additions & 0 deletions test/e2e/testdata/workflow-pending-metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fixture loaded by the TestPodPendingMetric e2e test.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
generateName: workflow-pending-metrics-
spec:
entrypoint: main
# The node selector asks for arch=nonexistent; the e2e test then waits for the
# pod to be Pending with an Unschedulable condition.
nodeSelector:
arch: nonexistent
templates:
# Entrypoint: a single step that invokes the run-test template.
- name: main
steps:
- - name: runTest
template: run-test
# Container template; command plus args amount to `sh -c "exit 1"`.
- name: run-test
container:
name: runner
image: 'argoproj/argosay:v2'
args:
- exit 1
command:
- sh
- -c
3 changes: 3 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1360,6 +1360,9 @@ func (woc *wfOperationCtx) assessNodeStatus(ctx context.Context, pod *apiv1.Pod,
new.Phase = wfv1.NodePending
new.Message = getPendingReason(pod)
new.Daemoned = nil
if old.Phase != new.Phase || old.Message != new.Message {
woc.controller.metrics.ChangePodPending(ctx, new.Message, pod.ObjectMeta.Namespace)
}
case apiv1.PodSucceeded:
new.Phase = wfv1.NodeSucceeded
new.Daemoned = nil
Expand Down
36 changes: 36 additions & 0 deletions workflow/metrics/counter_pod_pending.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package metrics

import (
"context"
"strings"
)

// namePodPending is the instrument name of the pod pending counter.
const namePodPending = `pod_pending_count`

// addPodPendingCounter creates the pod_pending_count int64 counter on m as a
// built-in instrument. The context argument is unused. It returns whatever
// error createInstrument reports.
func addPodPendingCounter(_ context.Context, m *Metrics) error {
	const (
		description = "Total number of pods that started pending by reason"
		unit        = "{pod}"
	)
	return m.createInstrument(int64Counter, namePodPending, description, unit, withAsBuiltIn())
}

// ChangePodPending adds one to the pod pending counter for the given namespace
// and a shortened form of reason.
//
// Only the text before the first ':' in reason is used as the reason
// attribute (the whole string when there is no ':'); the full message would
// produce far too many distinct attribute values. Pods whose shortened reason
// is "PodInitializing" are not counted.
func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string) {
	shortReason := reason
	if colon := strings.Index(reason, `:`); colon >= 0 {
		shortReason = reason[:colon]
	}

	// Initializing pods are uninteresting and usually short-lived; the
	// pod_phase metric can cope with them being visible there instead.
	if shortReason == "PodInitializing" {
		return
	}

	attribs := instAttribs{
		{name: labelPodPendingReason, value: shortReason},
		{name: labelPodNamespace, value: namespace},
	}
	m.addInt(ctx, namePodPending, 1, attribs)
}
5 changes: 3 additions & 2 deletions workflow/metrics/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ const (

labelNodePhase string = `node_phase`

labelPodPhase string = `phase`
labelPodNamespace string = `namespace`
labelPodPhase string = `phase`
labelPodNamespace string = `namespace`
labelPodPendingReason string = `reason`

labelQueueName string = `queue_name`

Expand Down
1 change: 1 addition & 0 deletions workflow/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call
addPodPhaseGauge,
addPodPhaseCounter,
addPodMissingCounter,
addPodPendingCounter,
addWorkflowPhaseGauge,
addOperationDurationHistogram,
addErrorCounter,
Expand Down

0 comments on commit 69dfcca

Please sign in to comment.