From 9def3b4c63b94e9d87378b383b0d0e1c505ebf27 Mon Sep 17 00:00:00 2001 From: Alan Clucas Date: Thu, 27 Jun 2024 15:08:21 +0100 Subject: [PATCH] feat: new cron workflow trigger counter metric From #12589. A new metric which counts how many times each cron workflow has triggered. A simple enough counter which can be checked against expectations for the cron. Note to reviewers: this is part of a stack of reviews for metrics changes. Please don't merge until the rest of the stack is also ready. Signed-off-by: Alan Clucas --- docs/metrics.md | 9 +++++++ docs/upgrading.md | 1 + test/e2e/metrics_test.go | 16 ++++++++++++ test/e2e/testdata/cronworkflow-metrics.yaml | 19 ++++++++++++++ workflow/cron/operator.go | 1 + .../metrics/counter_cronworkflow_trigger.go | 25 +++++++++++++++++++ workflow/metrics/labels.go | 2 ++ workflow/metrics/metrics.go | 1 + 8 files changed, 74 insertions(+) create mode 100644 test/e2e/testdata/cronworkflow-metrics.yaml create mode 100644 workflow/metrics/counter_cronworkflow_trigger.go diff --git a/docs/metrics.md b/docs/metrics.md index 891e8b3d23b0..57558971ff61 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -201,6 +201,15 @@ Metrics for the [Four Golden Signals](https://sre.google/sre-book/monitoring-dis Some metric attributes may have high cardinality and are marked with ⚠️ to warn you. You may need to disable this metric or disable the attribute. +#### `cronworkflows_triggered_total` + +A counter of the number of times a CronWorkflow has been triggered. + +| attribute | explanation | +|-------------|-------------------------------------------| +| `name` | ⚠️ The name of the CronWorkflow. | +| `namespace` | The namespace of the CronWorkflow | + #### `gauge` A gauge of the number of workflows currently in the cluster in each phase. The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. A workflow can be stuck in `Running` with pending pods for a long time. 
diff --git a/docs/upgrading.md b/docs/upgrading.md index 55f07328f549..6114d80c24c4 100644 --- a/docs/upgrading.md +++ b/docs/upgrading.md @@ -25,6 +25,7 @@ These notes explain the differences in using the Prometheus `/metrics` endpoint The following are new metrics: +* `cronworkflows_triggered_total` * `is_leader` * `k8s_request_duration` * `pods_total_count` diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 3bc6dcc91240..f6af1bf4b954 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -142,6 +142,22 @@ func (s *MetricsSuite) TestPodPendingMetric() { WaitForWorkflowDeletion() } +func (s *MetricsSuite) TestCronTriggeredCounter() { + s.Given(). + CronWorkflow(`@testdata/cronworkflow-metrics.yaml`). + When(). + CreateCronWorkflow(). + Wait(1 * time.Minute). // This pattern is used in cron_test.go too + Then(). + ExpectCron(func(t *testing.T, cronWf *wfv1.CronWorkflow) { + s.e(s.T()).GET(""). + Expect(). + Status(200). + Body(). + Contains(`cronworkflows_triggered_total{name="test-cron-metric",namespace="argo"} 1`) + }) +} + func TestMetricsSuite(t *testing.T) { suite.Run(t, new(MetricsSuite)) } diff --git a/test/e2e/testdata/cronworkflow-metrics.yaml b/test/e2e/testdata/cronworkflow-metrics.yaml new file mode 100644 index 000000000000..e3fdd9c13e9b --- /dev/null +++ b/test/e2e/testdata/cronworkflow-metrics.yaml @@ -0,0 +1,19 @@ +apiVersion: argoproj.io/v1alpha1 +kind: CronWorkflow +metadata: + name: test-cron-metric +spec: + schedule: "* * * * *" + concurrencyPolicy: "Allow" + startingDeadlineSeconds: 0 + workflowSpec: + metadata: + labels: + workflows.argoproj.io/test: "true" + podGC: + strategy: OnPodCompletion + entrypoint: whalesay + templates: + - name: whalesay + container: + image: argoproj/argosay:v2 diff --git a/workflow/cron/operator.go b/workflow/cron/operator.go index ca5d8e6a0118..d2d3869e5095 100644 --- a/workflow/cron/operator.go +++ b/workflow/cron/operator.go @@ -80,6 +80,7 @@ func (woc *cronWfOperationCtx) 
run(ctx context.Context, scheduledRuntime time.Ti defer woc.persistUpdate(ctx) woc.log.Infof("Running %s", woc.name) + woc.metrics.CronWfTrigger(ctx, woc.name, woc.cronWf.ObjectMeta.Namespace) // If the cron workflow has a schedule that was just updated, update its annotation if woc.cronWf.IsUsingNewSchedule() { diff --git a/workflow/metrics/counter_cronworkflow_trigger.go b/workflow/metrics/counter_cronworkflow_trigger.go new file mode 100644 index 000000000000..2f1950e4331e --- /dev/null +++ b/workflow/metrics/counter_cronworkflow_trigger.go @@ -0,0 +1,25 @@ +package metrics + +import ( + "context" +) + +const ( + nameCronTriggered = `cronworkflows_triggered_total` +) + +func addCronWfTriggerCounter(_ context.Context, m *Metrics) error { + return m.createInstrument(int64Counter, + nameCronTriggered, + "Total number of cron workflows triggered", + "{cronworkflow}", + withAsBuiltIn(), + ) +} + +func (m *Metrics) CronWfTrigger(ctx context.Context, name, namespace string) { + m.addInt(ctx, nameCronTriggered, 1, instAttribs{ + {name: labelCronWFName, value: name}, + {name: labelWorkflowNamespace, value: namespace}, + }) +} diff --git a/workflow/metrics/labels.go b/workflow/metrics/labels.go index 5094e2044a0a..9a29e692a764 100644 --- a/workflow/metrics/labels.go +++ b/workflow/metrics/labels.go @@ -10,6 +10,8 @@ const ( labelBuildGitTreeState string = `git_treestate` labelBuildGitTag string = `git_tag` + labelCronWFName string = `name` + labelErrorCause string = "cause" labelLogLevel string = `level` diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go index 58ebbb8ab7bf..e586d13c355f 100644 --- a/workflow/metrics/metrics.go +++ b/workflow/metrics/metrics.go @@ -99,6 +99,7 @@ func New(ctx context.Context, serviceName string, config *Config, callbacks Call addPodMissingCounter, addPodPendingCounter, addWorkflowPhaseGauge, + addCronWfTriggerCounter, addOperationDurationHistogram, addErrorCounter, addLogCounter,