From dc7479a9d168e029ebd3950a8078929146dd61e4 Mon Sep 17 00:00:00 2001 From: Alan Clucas Date: Thu, 27 Jun 2024 15:08:21 +0100 Subject: [PATCH] feat: new cron workflow trigger counter metric From #12589. A new metric which counts how many times each cron workflow has triggered. A simple enough counter which can be checked against expectations for the cron. Note to reviewers: this is part of a stack of reviews for metrics changes. Please don't merge until the rest of the stack is also ready. Signed-off-by: Alan Clucas --- docs/metrics.md | 9 +++++++ docs/upgrading.md | 1 + test/e2e/metrics_test.go | 16 ++++++++++++ test/e2e/testdata/cronworkflow-metrics.yaml | 19 ++++++++++++++ workflow/cron/operator.go | 1 + .../metrics/counter_cronworkflow_trigger.go | 25 +++++++++++++++++++ workflow/metrics/labels.go | 2 ++ workflow/metrics/metrics.go | 1 + 8 files changed, 74 insertions(+) create mode 100644 test/e2e/testdata/cronworkflow-metrics.yaml create mode 100644 workflow/metrics/counter_cronworkflow_trigger.go diff --git a/docs/metrics.md b/docs/metrics.md index f91dd272fd9da..e802fc2e0b925 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -216,6 +216,15 @@ The build information for this workflow controller | `treestate` | Whether the git tree was `dirty` or `clean` when this was built | | `tag` | The tag on the git commit or `untagged` if it was not tagged | +#### `cronworkflows_triggered_total` + +A counter of the number of times a CronWorkflow has been triggered. + +| attribute | explanation | +|-------------|-------------------------------------------| +| `name` | ⚠️ The name of the CronWorkflow. | +| `namespace` | The namespace of the CronWorkflow | + #### `gauge` A gauge of the number of workflows currently in the cluster in each phase. The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. A workflow can be stuck in `Running` with pending pods for a long time. 
diff --git a/docs/upgrading.md b/docs/upgrading.md index bb7c86b188df7..50e1444a94f4a 100644 --- a/docs/upgrading.md +++ b/docs/upgrading.md @@ -26,6 +26,7 @@ These notes explain the differences in using the Prometheus `/metrics` endpoint The following are new metrics: * `controller_build_info` +* `cronworkflows_triggered_total` * `k8s_request_duration` * `leader` * `pods_total_count` diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 3bc6dcc91240d..f6af1bf4b954c 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -142,6 +142,22 @@ func (s *MetricsSuite) TestPodPendingMetric() { WaitForWorkflowDeletion() } +func (s *MetricsSuite) TestCronTriggeredCounter() { + s.Given(). + CronWorkflow(`@testdata/cronworkflow-metrics.yaml`). + When(). + CreateCronWorkflow(). + Wait(1 * time.Minute). // This pattern is used in cron_test.go too + Then(). + ExpectCron(func(t *testing.T, cronWf *wfv1.CronWorkflow) { + s.e(s.T()).GET(""). + Expect(). + Status(200). + Body(). 
+ Contains(`cronworkflows_triggered_total{name="test-cron-metric",namespace="argo"} 1`) + }) +} + func TestMetricsSuite(t *testing.T) { suite.Run(t, new(MetricsSuite)) } diff --git a/test/e2e/testdata/cronworkflow-metrics.yaml b/test/e2e/testdata/cronworkflow-metrics.yaml new file mode 100644 index 0000000000000..e3fdd9c13e9b7 --- /dev/null +++ b/test/e2e/testdata/cronworkflow-metrics.yaml @@ -0,0 +1,19 @@ +apiVersion: argoproj.io/v1alpha1 +kind: CronWorkflow +metadata: + name: test-cron-metric +spec: + schedule: "* * * * *" + concurrencyPolicy: "Allow" + startingDeadlineSeconds: 0 + workflowSpec: + metadata: + labels: + workflows.argoproj.io/test: "true" + podGC: + strategy: OnPodCompletion + entrypoint: whalesay + templates: + - name: whalesay + container: + image: argoproj/argosay:v2 diff --git a/workflow/cron/operator.go b/workflow/cron/operator.go index ca5d8e6a01180..d2d3869e5095a 100644 --- a/workflow/cron/operator.go +++ b/workflow/cron/operator.go @@ -80,6 +80,7 @@ func (woc *cronWfOperationCtx) run(ctx context.Context, scheduledRuntime time.Ti defer woc.persistUpdate(ctx) woc.log.Infof("Running %s", woc.name) + woc.metrics.CronWfTrigger(ctx, woc.name, woc.cronWf.ObjectMeta.Namespace) // If the cron workflow has a schedule that was just updated, update its annotation if woc.cronWf.IsUsingNewSchedule() { diff --git a/workflow/metrics/counter_cronworkflow_trigger.go b/workflow/metrics/counter_cronworkflow_trigger.go new file mode 100644 index 0000000000000..2f1950e4331e1 --- /dev/null +++ b/workflow/metrics/counter_cronworkflow_trigger.go @@ -0,0 +1,25 @@ +package metrics + +import ( + "context" +) + +const ( + nameCronTriggered = `cronworkflows_triggered_total` +) + +func addCronWfTriggerCounter(_ context.Context, m *Metrics) error { + return m.createInstrument(int64Counter, + nameCronTriggered, + "Total number of cron workflows triggered", + "{cronworkflow}", + withAsBuiltIn(), + ) +} + +func (m *Metrics) CronWfTrigger(ctx context.Context, name, namespace 
string) { + m.addInt(ctx, nameCronTriggered, 1, instAttribs{ + {name: labelCronWFName, value: name}, + {name: labelWorkflowNamespace, value: namespace}, + }) +} diff --git a/workflow/metrics/labels.go b/workflow/metrics/labels.go index 9652408e4a16b..8678e7d41a648 100644 --- a/workflow/metrics/labels.go +++ b/workflow/metrics/labels.go @@ -10,6 +10,8 @@ const ( labelBuildGitTreeState string = `treestate` labelBuildGitTag string = `tag` + labelCronWFName string = `name` + labelErrorCause string = "cause" labelLogLevel string = `level` diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go index 3f74d05de3fe4..ca0ad92e89d4e 100644 --- a/workflow/metrics/metrics.go +++ b/workflow/metrics/metrics.go @@ -100,6 +100,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call addPodMissingCounter, addPodPendingCounter, addWorkflowPhaseGauge, + addCronWfTriggerCounter, addOperationDurationHistogram, addErrorCounter, addLogCounter,