From 67295386a9385c6a4527da4bdc2f84f2544619f2 Mon Sep 17 00:00:00 2001 From: Alan Clucas Date: Wed, 26 Jun 2024 11:44:25 +0100 Subject: [PATCH] feat: new pod phase counter metric From #12589 This is a new metric counting how many pods have gone into each pod phase as observed by the controller. This is like pods_gauge, but as a counter rather than a gauge. The gauge is useful at telling you what is happening right now in the cluster, but is not useful for long term statistics such as "How many pods has workflows run" because it may never report some pods at all. This counter can answer that question. Note to reviewers: this is part of a stack of reviews for metrics changes. Please don't merge until the rest of the stack is also ready. Signed-off-by: Alan Clucas --- docs/metrics.md | 10 ++++++++++ docs/upgrading.md | 1 + workflow/controller/operator.go | 7 +++++-- workflow/controller/operator_test.go | 2 +- workflow/metrics/counter_pod_phase.go | 25 +++++++++++++++++++++++++ workflow/metrics/labels.go | 4 +++- workflow/metrics/metrics.go | 1 + 7 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 workflow/metrics/counter_pod_phase.go diff --git a/docs/metrics.md b/docs/metrics.md index ffbc8c12b1d0..fc7de31fdea4 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -308,6 +308,16 @@ You should only see this under high load. `recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds. +#### `pods_total_count` + +A counter of the number of pods which have entered each phase and then been observed by the controller. +This is not directly controlled by the workflow controller, so it is possible for some pod phases to be missed. 
+ +| attribute | explanation | +|-------------|-------------------------------------------| +| `phase` | The phase that the pod is in | +| `namespace` | The namespace in which the pod is running | + #### `queue_adds_count` A counter of additions to the work queues inside the controller. diff --git a/docs/upgrading.md b/docs/upgrading.md index 6b972b1807a2..e449fda8863e 100644 --- a/docs/upgrading.md +++ b/docs/upgrading.md @@ -28,6 +28,7 @@ The following are new metrics: * `controller_build_info` * `k8s_request_duration` * `leader` +* `pods_total_count` * `queue_duration` * `queue_longest_running` * `queue_retries` diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 8a295f0bca57..ef474323aeb6 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1137,7 +1137,7 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) (error, bool) defer wfNodesLock.Unlock() node, err := woc.wf.Status.Nodes.Get(nodeID) if err == nil { - if newState := woc.assessNodeStatus(pod, node); newState != nil { + if newState := woc.assessNodeStatus(ctx, pod, node); newState != nil { // Check whether its taskresult is in an incompleted state. 
if newState.Succeeded() && woc.wf.Status.IsTaskResultIncomplete(node.ID) { woc.log.WithFields(log.Fields{"nodeID": newState.ID}).Debug("Taskresult of the node not yet completed") @@ -1321,7 +1321,7 @@ func printPodSpecLog(pod *apiv1.Pod, wfName string) { // assessNodeStatus compares the current state of a pod with its corresponding node // and returns the new node status if something changed -func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus) *wfv1.NodeStatus { +func (woc *wfOperationCtx) assessNodeStatus(ctx context.Context, pod *apiv1.Pod, old *wfv1.NodeStatus) *wfv1.NodeStatus { new := old.DeepCopy() tmpl, err := woc.GetNodeTemplate(old) if err != nil { @@ -1375,6 +1375,9 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus new.Phase = wfv1.NodeError new.Message = fmt.Sprintf("Unexpected pod phase for %s: %s", pod.ObjectMeta.Name, pod.Status.Phase) } + if old.Phase != new.Phase { + woc.controller.metrics.ChangePodPhase(ctx, string(new.Phase), pod.ObjectMeta.Namespace) + } // if it's ContainerSetTemplate pod then the inner container names should match to some node names, // in this case need to update nodes according to container status diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index 9b471a8cac68..302aeef300e8 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -1754,7 +1754,7 @@ func TestAssessNodeStatus(t *testing.T) { cancel, controller := newController() defer cancel() woc := newWorkflowOperationCtx(wf, controller) - got := woc.assessNodeStatus(tt.pod, tt.node) + got := woc.assessNodeStatus(context.TODO(), tt.pod, tt.node) assert.Equal(t, tt.want, got.Phase) }) } diff --git a/workflow/metrics/counter_pod_phase.go b/workflow/metrics/counter_pod_phase.go new file mode 100644 index 000000000000..530be09ad5ce --- /dev/null +++ b/workflow/metrics/counter_pod_phase.go @@ -0,0 +1,25 @@ +package metrics + +import ( + 
"context" +) + +const ( + namePodPhase = `pods_total_count` +) + +func addPodPhaseCounter(_ context.Context, m *Metrics) error { + return m.createInstrument(int64Counter, + namePodPhase, + "Total number of Pods that have entered each phase", + "{pod}", + withAsBuiltIn(), + ) +} + +func (m *Metrics) ChangePodPhase(ctx context.Context, phase, namespace string) { + m.addInt(ctx, namePodPhase, 1, instAttribs{ + {name: labelPodPhase, value: phase}, + {name: labelPodNamespace, value: namespace}, + }) +} diff --git a/workflow/metrics/labels.go b/workflow/metrics/labels.go index 29b9d8855c43..2accb6c9e67e 100644 --- a/workflow/metrics/labels.go +++ b/workflow/metrics/labels.go @@ -16,7 +16,9 @@ const ( labelNodePhase string = `node_phase` - labelPodPhase string = `phase` + labelPodPhase string = `phase` + labelPodNamespace string = `namespace` + labelQueueName string = `queue_name` labelRecentlyStarted string = `recently_started` diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go index 5445187f5677..de491f1e28dc 100644 --- a/workflow/metrics/metrics.go +++ b/workflow/metrics/metrics.go @@ -96,6 +96,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call addBuildInfo, addIsLeader, addPodPhaseGauge, + addPodPhaseCounter, addPodMissingCounter, addWorkflowPhaseGauge, addOperationDurationHistogram,