Skip to content

Commit

Permalink
feat: new pod pending counter metric
Browse files Browse the repository at this point in the history
The workflow controller is a Kubernetes controller that creates
pods. Sometimes those pods do not start and remain pending.

This metric counts the number of pods that may have been observed as
pending, by namespace and truncated reason. The reason is the first
part of the kubernetes pod pending `Reason` up to the first `:` if
present.

It ignores all pods in the `PodInitializing` state, as I consider this
unremarkable and temporary.

This is intended for users to create alerts on particular `reasons` or
if this metric is climbing unusually rapidly.

Note to reviewers: this is part of a stack of reviews for metrics
changes. Please don't merge until the rest of the stack is also ready.

Signed-off-by: Alan Clucas <[email protected]>
  • Loading branch information
Joibel committed Aug 16, 2024
1 parent adf98ce commit f6fa6d5
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 1 deletion.
10 changes: 10 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,16 @@ You should only see this under high load.

`recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds.

#### `pods_total_count`

A gauge of the number of pods which have entered each phase and have then been observed by the controller.
This is not directly controlled by the workflow controller, so it is possible for some pod phases to be missed.

| attribute | explanation |
|-------------|-------------------------------------------|
| `phase` | The phase that the pod is in |
| `namespace` | The namespace in which the pod is running |

#### `queue_adds_count`

A counter of additions to the work queues inside the controller.
Expand Down
1 change: 1 addition & 0 deletions docs/upgrading.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ These notes explain the differences in using the Prometheus `/metrics` endpoint
The following are new metrics:

* `is_leader`
* `pods_total_count`
* `queue_duration`
* `queue_longest_running`
* `queue_retries`
Expand Down
32 changes: 32 additions & 0 deletions test/e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ package e2e

import (
"testing"
"time"

"github.com/gavv/httpexpect/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
Expand Down Expand Up @@ -109,6 +111,36 @@ func (s *MetricsSuite) TestFailedMetric() {
})
}

// TestPodPendingMetric submits the workflow-pending-metrics fixture, waits
// until its pod is Pending with an Unschedulable condition, and then checks
// that the scraped response contains the expected pod_pending_count sample.
// The workflow is deleted afterwards.
func (s *MetricsSuite) TestPodPendingMetric() {
	// Reports whether the pod is Pending and carries at least one condition
	// whose reason is Unschedulable.
	pendingUnschedulable := func(p *corev1.Pod) bool {
		if p.Status.Phase != corev1.PodPending {
			return false
		}
		for i := range p.Status.Conditions {
			if p.Status.Conditions[i].Reason == corev1.PodReasonUnschedulable {
				return true
			}
		}
		return false
	}

	// The workflow must still be running while its pod cannot be scheduled,
	// and the counter must have been bumped exactly once for this reason.
	verifyPendingCounter := func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
		assert.Equal(t, wfv1.WorkflowRunning, status.Phase)
		s.e(s.T()).GET("").
			Expect().
			Status(200).
			Body().
			Contains(`pod_pending_count{namespace="argo",reason="Unschedulable"} 1`)
	}

	s.Given().
		Workflow(`@testdata/workflow-pending-metrics.yaml`).
		When().
		SubmitWorkflow().
		WaitForPod(fixtures.PodCondition(pendingUnschedulable)).
		Wait(2 * time.Second). // Hack: We may well observe the pod change faster than the controller
		Then().
		ExpectWorkflow(verifyPendingCounter).
		When().
		DeleteWorkflow().
		WaitForWorkflowDeletion()
}

func TestMetricsSuite(t *testing.T) {
suite.Run(t, new(MetricsSuite))
}
22 changes: 22 additions & 0 deletions test/e2e/testdata/workflow-pending-metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Workflow fixture used by TestPodPendingMetric in test/e2e/metrics_test.go.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
generateName: workflow-pending-metrics-
spec:
entrypoint: main
# NOTE(review): presumably no node carries the label arch=nonexistent, which
# would keep the pod Pending/Unschedulable as the test expects - confirm.
nodeSelector:
arch: nonexistent
templates:
- name: main
steps:
- - name: runTest
template: run-test
- name: run-test
container:
name: runner
image: 'argoproj/argosay:v2'
args:
- exit 1
command:
- sh
- -c
3 changes: 3 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1360,6 +1360,9 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus
new.Phase = wfv1.NodePending
new.Message = getPendingReason(pod)
new.Daemoned = nil
if old.Phase != new.Phase || old.Message != new.Message {
woc.controller.metrics.ChangePodPending(ctx, new.Message, pod.ObjectMeta.Namespace)
}
case apiv1.PodSucceeded:
new.Phase = wfv1.NodeSucceeded
new.Daemoned = nil
Expand Down
36 changes: 36 additions & 0 deletions workflow/metrics/counter_pod_pending.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package metrics

import (
"context"
"strings"
)

const (
namePodPending = `pod_pending_count`
)

// addPodPendingCounter creates the pod_pending_count int64 counter instrument
// on m and returns any error reported by createInstrument. The context
// argument is unused.
func addPodPendingCounter(_ context.Context, m *Metrics) error {
	const (
		description = "Total number of pods that started pending by reason"
		unit        = "{pod}"
	)
	return m.createInstrument(int64Counter, namePodPending, description, unit, withAsBuiltIn())
}

// ChangePodPending adds one to the pod_pending_count counter for the given
// pending reason and namespace.
//
// Only the part of reason before the first ':' is used as the reason
// attribute; a reason without ':' is used unchanged. Pods whose shortened
// reason is "PodInitializing" are not counted.
func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string) {
	// Reason strings have a lot of stuff that would result in insane
	// cardinality, so we just take everything from before the first ':'.
	// strings.Cut returns the whole string when no ':' is present.
	shortReason, _, _ := strings.Cut(reason, ":")

	// Drop these, they are uninteresting and usually short;
	// the pod_phase metric can cope with this being visible.
	if shortReason == "PodInitializing" {
		return
	}

	m.addInt(ctx, namePodPending, 1, instAttribs{
		{name: labelPodPendingReason, value: shortReason},
		{name: labelPodNamespace, value: namespace},
	})
}
4 changes: 3 additions & 1 deletion workflow/metrics/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ const (

labelNodePhase string = `node_phase`

labelPodPhase string = `phase`
labelPodNamespace string = `namespace`
labelPodPendingReason string = `reason`

labelQueueName string = `queue_name`

labelRecentlyStarted string = `recently_started`
Expand Down
1 change: 1 addition & 0 deletions workflow/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call
addIsLeader,
addPodPhaseGauge,
addPodMissingCounter,
addPodPendingCounter,
addWorkflowPhaseGauge,
addOperationDurationHistogram,
addErrorCounter,
Expand Down

0 comments on commit f6fa6d5

Please sign in to comment.