From a71b9137489f615d670c33977ffb177fb9a4a7a4 Mon Sep 17 00:00:00 2001 From: Alan Clucas Date: Mon, 25 Nov 2024 13:59:47 +0000 Subject: [PATCH] refactor: autogenerate metrics docs Signed-off-by: Alan Clucas --- Makefile | 8 +- docs/metrics.md | 338 +++++++++----- util/telemetry/attributes.go | 58 +-- util/telemetry/builder/builder.go | 342 ++++++++++++++ util/telemetry/builder/values.yaml | 406 +++++++++++++++++ util/telemetry/builtinInstrument.go | 37 ++ util/telemetry/counter_deprecations.go | 13 +- util/telemetry/metrics_list.go | 420 ++++++++++++++++++ util/telemetry/version.go | 10 +- .../metrics/counter_cronworkflow_policy.go | 15 +- .../metrics/counter_cronworkflow_trigger.go | 15 +- workflow/metrics/counter_error.go | 16 +- workflow/metrics/counter_log.go | 13 +- workflow/metrics/counter_pod_missing.go | 13 +- workflow/metrics/counter_pod_pending.go | 13 +- workflow/metrics/counter_pod_phase.go | 13 +- workflow/metrics/counter_template.go | 13 +- workflow/metrics/counter_workflow_phase.go | 13 +- workflow/metrics/gauge_pod_phase.go | 13 +- workflow/metrics/gauge_workflow_condition.go | 12 +- workflow/metrics/gauge_workflow_phase.go | 13 +- workflow/metrics/histogram_durations.go | 12 +- workflow/metrics/histogram_template.go | 13 +- workflow/metrics/leader.go | 13 +- workflow/metrics/leader_test.go | 4 +- workflow/metrics/metrics_k8s_request.go | 24 +- workflow/metrics/metrics_test.go | 2 +- workflow/metrics/work_queue.go | 98 +--- 28 files changed, 1529 insertions(+), 431 deletions(-) create mode 100644 util/telemetry/builder/builder.go create mode 100644 util/telemetry/builder/values.yaml create mode 100644 util/telemetry/builtinInstrument.go create mode 100644 util/telemetry/metrics_list.go diff --git a/Makefile b/Makefile index 488ba239c079..345e941d8c59 100644 --- a/Makefile +++ b/Makefile @@ -471,7 +471,7 @@ lint: server/static/files.go $(GOPATH)/bin/golangci-lint # for local we have a faster target that prints to stdout, does not use json, and can cache because it has no coverage .PHONY: test -test: server/static/files.go +test: server/static/files.go util/telemetry/metrics_list.go util/telemetry/attributes.go go build ./... env KUBECONFIG=/dev/null $(GOTEST) ./... # marker file, based on it's modification time, we know how long ago this target was run @@ -632,8 +632,12 @@ clean: go clean rm -Rf test-results node_modules vendor v2 v3 argoexec-linux-amd64 dist/* ui/dist -# swagger +# Built telemetry files +util/telemetry/metrics_list.go util/telemetry/attributes.go docs/metrics.md: util/telemetry/builder/builder.go util/telemetry/builder/values.yaml + @echo Rebuilding telemetry files + cd util/telemetry/builder && go run . +# swagger pkg/apis/workflow/v1alpha1/openapi_generated.go: $(GOPATH)/bin/openapi-gen $(TYPES) # These files are generated on a v3/ folder by the tool. Link them to the root folder [ -e ./v3 ] || ln -s . v3 diff --git a/docs/metrics.md b/docs/metrics.md index a1cd4adb586f..2d00f5041663 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -242,38 +242,38 @@ Metrics for the [Four Golden Signals](https://sre.google/sre-book/monitoring-dis !!! Warning "High cardinality" Some metric attributes may have high cardinality and are marked with ⚠️ to warn you. You may need to disable this metric or disable the attribute. - - + #### `cronworkflows_concurrencypolicy_triggered` A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running. 
-| attribute | explanation | -|-------------|-------------------------------------------| -| `name` | ⚠️ The name of the CronWorkflow | +| attribute | explanation | +|-----------|-------------| +| `name` | ⚠️ The name of the CronWorkflow | | `namespace` | The namespace of the CronWorkflow | | `concurrency_policy` | The concurrency policy which was triggered, will be either `Forbid` or `Replace` | #### `cronworkflows_triggered_total` -A counter of the number of times a CronWorkflow has been triggered. +A counter of the total number of times a CronWorkflow has been triggered Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted. -| attribute | explanation | -|-------------|-------------------------------------------| -| `name` | ⚠️ The name of the CronWorkflow | +| attribute | explanation | +|-----------|-------------| +| `name` | ⚠️ The name of the CronWorkflow | | `namespace` | The namespace of the CronWorkflow | #### `deprecated_feature` -A counter which goes up when a feature which is [deprecated](deprecations.md) is used. +Incidents of deprecated feature being used. +Deprecated features are [explained here](deprecations.md). 🚨 This counter may go up much more than once for a single use of the feature. -| attribute | explanation | -|-------------|---------------------------------------------| -| `feature` | The name of the feature used | -| `namespace` | The namespace of the item using the feature | +| attribute | explanation | +|-----------|-------------| +| `feature` | The name of the feature used | +| `namespace` | The namespace in which the workflow is running | `feature` will be one of: @@ -282,129 +282,136 @@ A counter which goes up when a feature which is [deprecated](deprecations.md) is - [`synchronization semaphore`](deprecations.md#synchronization_semaphore) - [`workflow podpriority`](deprecations.md#workflow_podpriority) -#### `gauge` - -A gauge of the number of workflows currently in the cluster in each phase. The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. A workflow can be stuck in `Running` with pending pods for a long time. - -| attribute | explanation | -|-----------|-----------------------------------| -| `status` | The phase that the workflow is in | - #### `error_count` -A counter of certain errors incurred by the controller. +A counter of certain errors incurred by the controller by cause. -| attribute | explanation | -|-----------|------------------------| -| `cause` | The cause of the error | +| attribute | explanation | +|-----------|-------------| +| `cause` | The cause of the error | The currently tracked specific errors are -- `OperationPanic` - the controller `panic()` on a programming bug +- `OperationPanic` - the controller called `panic()` on encounterin a programming bug - `CronWorkflowSubmissionError` - A CronWorkflow failed submission - `CronWorkflowSpecError` - A CronWorkflow has an invalid specification -#### `k8s_request_total` +#### `gauge` -A counter of the number of API requests sent to the Kubernetes API. +A gauge of the number of workflows currently in the cluster in each phase +The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. +A workflow can be stuck in `Running` with pending pods for a long time. 
-| attribute | explanation | -|---------------|--------------------------------------------------------------------| -| `kind` | The kubernetes `kind` involved in the request such as `configmaps` | -| `verb` | The verb of the request, such as `Get` or `List` | -| `status_code` | The HTTP status code of the response | +| attribute | explanation | +|-----------|-------------| +| `status` | Boolean: `true` or `false` | -This metric is calculable from `k8s_request_duration`, and it is suggested you just collect that metric instead. +#### `is_leader` + +Emits 1 if leader, 0 otherwise. Always 1 if leader election is disabled +A gauge indicating if this Controller is the [leader](high-availability.md#workflow-controller). + +- `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). +- `0` otherwise, indicating that this controller is a standby that is not currently running workflows. + +This metric has no attributes. #### `k8s_request_duration` -A histogram recording how long each type of request took. +A histogram recording the API requests sent to the Kubernetes API. -| attribute | explanation | -|---------------|--------------------------------------------------------------------| -| `kind` | The kubernetes `kind` involved in the request such as `configmaps` | -| `verb` | The verb of the request, such as `Get` or `List` | -| `status_code` | The HTTP status code of the response | +| attribute | explanation | +|-----------|-------------| +| `kind` | The kubernetes `kind` involved in the request such as `configmaps` | +| `verb` | The verb of the request, such as `Get` or `List` | +| `status_code` | The HTTP status code of the response | -This is contains all the information contained in `k8s_request_total` along with timings. +Default bucket sizes: 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 60, 180 +This contains all the information contained in `k8s_request_total` along with timings. -#### `is_leader` +#### `k8s_request_total` -A gauge indicating if this Controller is the [leader](high-availability.md#workflow-controller). +A counter of the number of API requests sent to the Kubernetes API. -- `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). -- `0` otherwise, indicating that this controller is a standby that is not currently running workflows. +| attribute | explanation | +|-----------|-------------| +| `kind` | The kubernetes `kind` involved in the request such as `configmaps` | +| `verb` | The verb of the request, such as `Get` or `List` | +| `status_code` | The HTTP status code of the response | + +This metric is calculable from `k8s_request_duration`, and it is suggested you just collect that metric instead. #### `log_messages` -A count of log messages emitted by the controller by log level: `error`, `warn` and `info`. +A count of log messages emitted by the controller by log level: `error`, `warn` and `info` -| attribute | explanation | -|-----------|------------------------------| -| `level` | The log level of the message | +| attribute | explanation | +|-----------|-------------| +| `level` | The log level of the message | #### `operation_duration_seconds` -A histogram of durations of operations. An operation is a single workflow reconciliation loop within the workflow-controller. +A histogram of durations of operations +An operation is a single workflow reconciliation loop within the workflow-controller. 
It's the time for the controller to process a single workflow after it has been read from the cluster and is a measure of the performance of the controller affected by the complexity of the workflow. This metric has no attributes. The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using an `histogramBuckets` modifier in the `metricsConfig` block. -#### `pods_gauge` - -A gauge of the number of workflow created pods currently in the cluster in each phase. -It is possible for a workflow to start, but no pods be running (for example cluster is too busy to run them). -This metric sheds light on actual work being done. - -| attribute | explanation | -|-----------|------------------------------| -| `phase` | The phase that the pod is in | - #### `pod_missing` +Incidents of pod missing. A counter of pods that were not seen - for example they are by being deleted by Kubernetes. You should only see this under high load. -| attribute | explanation | -|--------------------|----------------------------------------| +| attribute | explanation | +|-----------|-------------| +| `node_phase` | The phase that the pod's node was in | | `recently_started` | Boolean: was this pod started recently | -| `node_phase` | The phase that the pod's node was in | `recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds. #### `pod_pending_count` -A counter of pods that have been seen in the Pending state. +Total number of pods that started pending by reason -| attribute | explanation | -|--------------------|-------------------------------------------| -| `reason` | Summary of the kubernetes Reason for pending. | -| `namespace` | The namespace in which the pod is running | +| attribute | explanation | +|-----------|-------------| +| `reason` | Summary of the kubernetes Reason for pending | +| `namespace` | The namespace in which the pod is running | -This metric ignores the `PodInitializing` reason and does not count it. -The `reason` attribute is the value from the Reason message before the `:` in the message. -This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. +#### `pods_gauge` + +A gauge of the number of workflow created pods currently in the cluster in each phase. +It is possible for a workflow to start, but no pods be running (for example cluster is too busy to run them). +This metric sheds light on actual work being done. + +| attribute | explanation | +|-----------|-------------| +| `phase` | The phase that the pod is in | #### `pods_total_count` -A gauge of the number of pods which have entered each phase and then observed by the controller. -This is not directly controlled by the workflow controller, so it is possible for some pod phases to be missed. +Total number of Pods that have entered each phase -| attribute | explanation | -|-------------|-------------------------------------------| -| `phase` | The phase that the pod is in | +| attribute | explanation | +|-----------|-------------| +| `phase` | The phase that the pod is in | | `namespace` | The namespace in which the pod is running | +This metric ignores the `PodInitializing` reason and does not count it. +The `reason` attribute is the value from the Reason message before the `:` in the message. 
+This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. + #### `queue_adds_count` -A counter of additions to the work queues inside the controller. -The rate of this shows how busy that area of the controller is. +A counter of additions to the work queues inside the controller +The rate of this shows how busy that area of the controller is -| attribute | explanation | -|---------------|-------------------| -| `worker_type` | The type of queue | +| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | Queues: @@ -421,102 +428,189 @@ This and associated metrics are all directly sourced from the [client-go workque A gauge of the current depth of the queues. If these get large then the workflow controller is not keeping up with the cluster. -See [queue adds count](#queue_adds_count) for details. +| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | + +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_duration` A histogram of the time events in the queues are taking to be processed. -See [queue adds count](#queue_adds_count) for details. +| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | + +Default bucket sizes: 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 60, 180 +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_latency` A histogram of the time events in the queues are taking before they are processed. -See [queue adds count](#queue_adds_count) for details. +| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | + +Default bucket sizes: 1, 5, 20, 60, 180 +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_longest_running` A gauge of the number of seconds that this queue's longest running processor has been running for. -See [queue adds count](#queue_adds_count) for details. 
+| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | + +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_retries` A counter of the number of times a message has been retried in the queue -See [queue adds count](#queue_adds_count) for details. +| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | + +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_unfinished_work` -A gauge of the number of queue items that have not been processed yet. +A gauge of the number of queue items that have not been processed yet -See [queue adds count](#queue_adds_count) for details. +| attribute | explanation | +|-----------|-------------| +| `queue_name` | The name of the queue | + +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `total_count` -A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace. +A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace -| attribute | explanation | -|-------------|------------------------------------------------| -| `phase` | The phase that the workflow has entered | +| attribute | explanation | +|-----------|-------------| +| `phase` | The phase that the workflow has entered | | `namespace` | The namespace in which the workflow is running | #### `version` -Build metadata for this Controller. +Build metadata for this Controller -| attribute | explanation | -|------------------|-------------------------------------------------------------------------------------------------------| -| `version` | The version of Argo | -| `platform` | The [Go platform](https://go.dev/doc/install/source#environment) compiled for. Example: `linux/amd64` | -| `go_version` | Version of Go used | -| `build_date` | Build date | -| `compiler` | The compiler used. 
Example: `gc` | -| `git_commit` | The full Git SHA1 commit | -| `git_tree_state` | Whether the Git tree was `dirty` or `clean` when built | -| `git_tag` | The Git tag or `untagged` if it was not tagged | +| attribute | explanation | +|-----------|-------------| +| `version` | The version of Argo | +| `platform` | The [Go platform](https://go.dev/doc/install/source#environment) compiled for. Example: `linux/amd64` | +| `go_version` | Version of Go used | +| `build_date` | Build date | +| `compiler` | The compiler used. Example: `gc` | +| `git_commit` | The full Git SHA1 commit | +| `git_tree_state` | Whether the Git tree was `dirty` or `clean` when built | +| `git_tag` | The Git tag or `untagged` if it was not tagged | #### `workers_busy_count` -A count of queue workers that are busy. +A gauge of queue workers that are busy + +| attribute | explanation | +|-----------|-------------| +| `worker_type` | The type of queue | + +Worker Types: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving -See [queue adds count](#queue_adds_count) for details. +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `workflow_condition` A gauge of the number of workflows with different conditions. This will tell you the number of workflows with running pods. -| attribute | explanation | -|-----------|-------------------------------------------------| -| `type` | the type of condition, currently only `Running` | -| `status` | `true` or `false` | +| attribute | explanation | +|-----------|-------------| +| `type` | The type of condition, currently only `PodRunning` | +| `status` | Boolean: `true` or `false` | #### `workflowtemplate_runtime` -A histogram of the duration of workflows using `workflowTemplateRef` only, as they enter each phase. +A histogram of the runtime of workflows using `workflowTemplateRef` only Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. Records time between entering the `Running` phase and completion, so does not include any time in `Pending`. -| attribute | explanation | -|-----------------|--------------------------------------------------------------| -| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | -| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | -| `namespace` | The namespace from which the WorkflowTemplate is being used | +| attribute | explanation | +|-----------|-------------| +| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | +| `namespace` | The namespace from which the WorkflowTemplate is being used | +| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | #### `workflowtemplate_triggered_total` A counter of workflows using `workflowTemplateRef` only, as they enter each phase. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. -| attribute | explanation | -|-----------------|--------------------------------------------------------------| -| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | -| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. 
| -| `namespace` | The namespace from which the WorkflowTemplate is being used | -| `phase` | The phase that the workflow entered | +| attribute | explanation | +|-----------|-------------| +| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | +| `namespace` | The namespace from which the WorkflowTemplate is being used | +| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | + + ### Metric types Please see the [Prometheus docs on metric types](https://prometheus.io/docs/concepts/metric_types/). diff --git a/util/telemetry/attributes.go b/util/telemetry/attributes.go index e2ca6700731f..b1e25a9975fc 100644 --- a/util/telemetry/attributes.go +++ b/util/telemetry/attributes.go @@ -1,46 +1,36 @@ +// Code generated by util/telemetry/builder. DO NOT EDIT. package telemetry const ( - AttribBuildVersion string = `version` - AttribBuildPlatform string = `platform` - AttribBuildGoVersion string = `go_version` - AttribBuildDate string = `build_date` AttribBuildCompiler string = `compiler` + AttribBuildDate string = `build_date` AttribBuildGitCommit string = `git_commit` - AttribBuildGitTreeState string = `git_treestate` AttribBuildGitTag string = `git_tag` - - AttribCronWFName string = `name` + AttribBuildGitTreeState string = `git_tree_state` + AttribBuildGoVersion string = `go_version` + AttribBuildPlatform string = `platform` + AttribBuildVersion string = `version` AttribConcurrencyPolicy string = `concurrency_policy` - - AttribDeprecatedFeature string = "feature" - - AttribErrorCause string = "cause" - - AttribLogLevel string = `level` - - AttribNodePhase string = `node_phase` - - AttribPodPhase string = `phase` - AttribPodNamespace string = `namespace` - AttribPodPendingReason string = `reason` - - AttribQueueName string = `queue_name` - - AttribRecentlyStarted string = `recently_started` - - AttribRequestKind = `kind` - AttribRequestVerb = `verb` - AttribRequestCode = `status_code` - + AttribCronWFName string = `name` + AttribCronWFNamespace string = `namespace` + AttribDeprecatedFeature string = `feature` + AttribErrorCause string = `cause` + AttribLogLevel string = `level` + AttribNodePhase string = `node_phase` + AttribPodNamespace string = `namespace` + AttribPodPendingReason string = `reason` + AttribPodPhase string = `phase` + AttribQueueName string = `queue_name` + AttribRecentlyStarted string = `recently_started` + AttribRequestCode string = `status_code` + AttribRequestKind string = `kind` + AttribRequestVerb string = `verb` + AttribTemplateCluster string = `cluster_scope` AttribTemplateName string = `name` AttribTemplateNamespace string = `namespace` - AttribTemplateCluster string = `cluster_scope` - - AttribWorkerType string = `worker_type` - + AttribWorkerType string = `worker_type` AttribWorkflowNamespace string = `namespace` AttribWorkflowPhase string = `phase` - AttribWorkflowStatus = `status` - AttribWorkflowType = `type` + AttribWorkflowStatus string = `status` + AttribWorkflowType string = `type` ) diff --git a/util/telemetry/builder/builder.go b/util/telemetry/builder/builder.go new file mode 100644 index 000000000000..899ed1a8c12f --- /dev/null +++ b/util/telemetry/builder/builder.go @@ -0,0 +1,342 @@ +package main + +import ( + "bytes" + _ "embed" + "errors" + "fmt" + "os" + "os/exec" + "regexp" + "slices" + "strings" + "unicode" + + "sigs.k8s.io/yaml" +) + +const generatedBanner string = "// Code generated by util/telemetry/builder. DO NOT EDIT." 
+ +//go:embed values.yaml +var valuesYaml []byte + +type attribute struct { + Name string `json:"name"` + DisplayName string `json:"displayName,omitempty"` + // Description is a markdown explanation for the documentation. One line only. + Description string `json:"description"` +} + +type allowedAttribute struct { + Name string `json:"name"` + Optional bool `json:"optional,omitempty"` +} + +type metric struct { + // Name: Metric name, in CamelCaps + // Will be snake cased for display purposes + Name string `json:"name"` + // Description: short description, emitted on the metrics endpoint and added to the documentation. Do not use marrkdown here. + Description string `json:"description"` + // ExtendedDescription: Markdown capable further description added to the documentation before attributes + ExtendedDescription string `json:"extendedDescription,omitempty"` + // Notes: Markdown capable further description added to the documentation after attributes + Notes string `json:"notes,omitempty"` + Attributes []allowedAttribute `json:"attributes,omitempty"` + // Unit: OpenTelemetry unit of measurement https://opentelemetry.io/docs/specs/otel/metrics/api/#instrument-unit + Unit string `json:"unit"` + Type string `json:"type"` + DefaultBuckets []float64 `json:"defaultBuckets,omitempty"` +} + +type attributesList []attribute +type metricsList []metric + +type values struct { + Attributes attributesList `json:"attributes"` + Metrics metricsList `json:"metrics"` +} + +func load() values { + var vals values + err := yaml.UnmarshalStrict(valuesYaml, &vals) + if err != nil { + panic(err) + } + return vals +} + +var collectedErrors []error + +func recordErrorString(err string) { + collectedErrors = append(collectedErrors, errors.New(err)) +} +func recordError(err error) { + collectedErrors = append(collectedErrors, err) +} + +func main() { + vals := load() + validate(&vals) + if len(collectedErrors) == 0 { + createMetricsDocs(&vals.Metrics, &vals.Attributes) + createAttributesGo(&vals.Attributes) + createMetricsListGo(&vals.Metrics) + } + if len(collectedErrors) > 0 { + for _, err := range collectedErrors { + fmt.Println(err) + } + os.Exit(1) + } +} + +func upperToSnake(in string) string { + runes := []rune(in) + in = string(append([]rune{unicode.ToLower(runes[0])}, runes[1:]...)) + re := regexp.MustCompile(`[A-Z]`) + return string(re.ReplaceAllFunc([]byte(in), func(in []byte) []byte { + return []byte(fmt.Sprintf("_%s", strings.ToLower(string(in[0])))) + })) +} + +func (a *attribute) displayName() string { + name := a.Name + if a.DisplayName != "" { + name = a.DisplayName + } + return upperToSnake(name) +} + +func validateMetricsAttributes(metrics *metricsList, attributes *attributesList) { + for _, metric := range *metrics { + for _, attribute := range metric.Attributes { + if getAttribByName(attribute.Name, attributes) == nil { + recordErrorString(fmt.Sprintf("Metric %s: attribute %s not defined", metric.Name, attribute.Name)) + } + } + } +} + +func validateAttributes(attributes *attributesList) { + if !slices.IsSortedFunc(*attributes, func(a, b attribute) int { + return strings.Compare(a.Name, b.Name) + }) { + recordErrorString("Attributes must be alphabetically sorted by Name") + } + for _, attribute := range *attributes { + if strings.Contains(attribute.Description, "\n") { + recordErrorString(fmt.Sprintf("%s: Description must be a single line", attribute.Name)) + } + } +} + +func validateMetrics(metrics *metricsList) { + if !slices.IsSortedFunc(*metrics, func(a, b metric) int { + return 
strings.Compare(a.Name, b.Name) + }) { + recordErrorString("Metrics must be alphabetically sorted by Name") + } + for _, metric := range *metrics { + // This is easier than enum+custom JSON unmarshall as this is not critical code + switch metric.Type { + case "Float64Histogram": + case "Float64ObservableGauge": + case "Int64Counter": + case "Int64UpDownCounter": + case "Int64ObservableGauge": + break + default: + recordErrorString(fmt.Sprintf("%s: Invalid metric type %s", metric.Name, metric.Type)) + } + if strings.Contains(metric.Description, "\n") { + recordErrorString(fmt.Sprintf("%s: Description must be a single line", metric.Name)) + } + } +} + +func validate(vals *values) { + validateAttributes(&vals.Attributes) + validateMetrics(&vals.Metrics) + validateMetricsAttributes(&vals.Metrics, &vals.Attributes) +} + +func createMetricsListGo(metrics *metricsList) { + filename := "../metrics_list.go" + writeMetricsListGo(filename, metrics) + goFmtFile(filename) +} + +func (m *metric) instrumentType() string { + return m.Type +} + +func (m *metric) displayName() string { + name := m.Name + return upperToSnake(name) +} + +func getAttribByName(name string, attribs *attributesList) *attribute { + for _, attrib := range *attribs { + if name == attrib.Name { + return &attrib + } + } + return nil +} + +func createMetricsDocs(metrics *metricsList, attribs *attributesList) { + filename := "../../../docs/metrics.md" + input, err := os.ReadFile(filename) + if err != nil { + recordError(err) + return + } + + lines := strings.Split(string(input), "\n") + + const begin = "Generated documentation BEGIN" + const end = "Generated documentation END" + stage := "before begin" + var cutfrom int + for i, line := range lines { + switch stage { + case "before begin": + if strings.Contains(line, begin) { + stage = "seeking end" + lines = slices.Insert(lines, i+1, metricsDocsLines(metrics, attribs)) + cutfrom = i + 2 + } + case "seeking end": + if strings.Contains(line, end) { + stage = "finishing" + lines = slices.Delete(lines, cutfrom, i+1) + } + case "finishing": + // Do nothing + } + } + if stage != "finishing" { + recordErrorString(fmt.Sprintf("Didn't successfully replace docs in %s", filename)) + return + } + + output := strings.Join(lines, "\n") + err = os.WriteFile(filename, []byte(output), 0444) + if err != nil { + recordError(err) + return + } +} + +func metricsDocsLines(metrics *metricsList, attribs *attributesList) string { + var out bytes.Buffer + fmt.Fprintf(&out, "\n") + for _, metric := range *metrics { + fmt.Fprintf(&out, "#### `%s`\n\n", metric.displayName()) + fmt.Fprintf(&out, "%s\n", metric.Description) + if metric.ExtendedDescription != "" { + fmt.Fprintf(&out, "%s\n", strings.Trim(metric.ExtendedDescription, " \n\t\r")) + } + fmt.Fprintf(&out, "\n") + + if len(metric.Attributes) > 0 { + fmt.Fprintf(&out, "| attribute | explanation |\n") + fmt.Fprintf(&out, "|-----------|-------------|\n") + for _, metricAttrib := range metric.Attributes { + if attrib := getAttribByName(metricAttrib.Name, attribs); attrib != nil { + // Failure should already be recorded as an error + fmt.Fprintf(&out, "| `%s` | %s |\n", attrib.displayName(), attrib.Description) + } + } + fmt.Fprintf(&out, "\n") + } else { + fmt.Fprintf(&out, "This metric has no attributes.\n\n") + } + if len(metric.DefaultBuckets) > 0 { + fmt.Fprintf(&out, "Default bucket sizes: ") + for i, bucket := range metric.DefaultBuckets { + if i != 0 { + fmt.Fprintf(&out, ", ") + } + fmt.Fprintf(&out, "%g", bucket) + } + fmt.Fprintf(&out, "\n") + } + + 
if metric.Notes != "" { + fmt.Fprintf(&out, "%s\n", strings.Trim(metric.Notes, " \n\t\r")) + fmt.Fprintf(&out, "\n") + } + } + return out.String() +} + +func writeMetricsListGo(filename string, metrics *metricsList) { + f, err := os.Create(filename) + if err != nil { + recordError(err) + return + } + defer f.Close() + fmt.Fprintf(f, "%s\n", generatedBanner) + fmt.Fprintf(f, "package telemetry\n\n") + for _, metric := range *metrics { + fmt.Fprintf(f, "var Instrument%s = BuiltinInstrument{\n", metric.Name) + fmt.Fprintf(f, "\tname: \"%s\",\n", metric.displayName()) + fmt.Fprintf(f, "\tdescription: \"%s\",\n", metric.Description) + fmt.Fprintf(f, "\tunit: \"%s\",\n", metric.Unit) + fmt.Fprintf(f, "\tinstType: %s,\n", metric.instrumentType()) + if len(metric.Attributes) > 0 { + fmt.Fprintf(f, "\tattributes: []BuiltinAttribute{\n") + for _, attrib := range metric.Attributes { + fmt.Fprintf(f, "\t\t{\n\t\t\tname: Attrib%s,\n", attrib.Name) + if attrib.Optional { + fmt.Fprintf(f, "\t\t\toptional: true,\n") + } + fmt.Fprintf(f, "\t\t},\n") + } + fmt.Fprintf(f, "\t},\n") + } + if len(metric.DefaultBuckets) > 0 { + fmt.Fprintf(f, "\tdefaultBuckets: []float64{\n") + for _, bucket := range metric.DefaultBuckets { + fmt.Fprintf(f, "\t\t%f,\n", bucket) + } + fmt.Fprintf(f, "\t},\n") + } + + fmt.Fprintf(f, "}\n\n") + } +} + +func createAttributesGo(attributes *attributesList) { + filename := "../attributes.go" + writeAttributesGo(filename, attributes) + goFmtFile(filename) +} + +func writeAttributesGo(filename string, attributes *attributesList) { + f, err := os.Create(filename) + if err != nil { + recordError(err) + return + } + defer f.Close() + fmt.Fprintf(f, "%s\n", generatedBanner) + fmt.Fprintf(f, "package telemetry\n\nconst (\n") + for _, attrib := range *attributes { + fmt.Fprintf(f, "\tAttrib%s string = `%s`\n", attrib.Name, attrib.displayName()) + } + fmt.Fprintf(f, ")\n") +} + +func goFmtFile(filename string) { + cmd := exec.Command("go", "fmt", filename) + var stderr bytes.Buffer + cmd.Stderr = &stderr + _, err := cmd.Output() + if err != nil { + recordErrorString(fmt.Sprintf("%s: %s", "go fmt failed", stderr.String())) + } +} diff --git a/util/telemetry/builder/values.yaml b/util/telemetry/builder/values.yaml new file mode 100644 index 000000000000..ab38becb541e --- /dev/null +++ b/util/telemetry/builder/values.yaml @@ -0,0 +1,406 @@ +attributes: + - name: BuildCompiler + displayName: compiler + description: "The compiler used. Example: `gc`" + - name: BuildDate + description: Build date + - name: BuildGitCommit + displayName: git_commit + description: The full Git SHA1 commit + - name: BuildGitTag + displayName: git_tag + description: "The Git tag or `untagged` if it was not tagged" + - name: BuildGitTreeState + displayName: git_tree_state + description: "Whether the Git tree was `dirty` or `clean` when built" + - name: BuildGoVersion + displayName: go_version + description: Version of Go used + - name: BuildPlatform + displayName: platform + description: "The [Go platform](https://go.dev/doc/install/source#environment) compiled for. 
Example: `linux/amd64`" + - name: BuildVersion + displayName: version + description: The version of Argo + - name: ConcurrencyPolicy + description: "The concurrency policy which was triggered, will be either `Forbid` or `Replace`" + - name: CronWFName + displayName: name + description: "⚠️ The name of the CronWorkflow" + - name: CronWFNamespace + displayName: namespace + description: "The namespace of the CronWorkflow" + - name: DeprecatedFeature + displayName: feature + description: The name of the feature used + - name: ErrorCause + displayName: cause + description: The cause of the error + - name: LogLevel + displayName: level + description: The log level of the message + - name: NodePhase + description: "The phase that the pod's node was in" + - name: PodNamespace + displayName: namespace + description: The namespace in which the pod is running + - name: PodPendingReason + displayName: reason + description: Summary of the kubernetes Reason for pending + - name: PodPhase + displayName: phase + description: The phase that the pod is in + - name: QueueName + description: The name of the queue + - name: RecentlyStarted + description: "Boolean: was this pod started recently" + - name: RequestCode + displayName: status_code + description: The HTTP status code of the response + - name: RequestKind + displayName: kind + description: "The kubernetes `kind` involved in the request such as `configmaps`" + - name: RequestVerb + displayName: verb + description: "The verb of the request, such as `Get` or `List`" + - name: TemplateCluster + displayName: cluster_scope + description: A boolean set true if this is a ClusterWorkflowTemplate + - name: TemplateName + displayName: name + description: "⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate." + - name: TemplateNamespace + displayName: namespace + description: The namespace from which the WorkflowTemplate is being used + - name: WorkerType + description: The type of queue + - name: WorkflowNamespace + displayName: namespace + description: The namespace in which the workflow is running + - name: WorkflowPhase + displayName: phase + description: The phase that the workflow has entered + - name: WorkflowStatus + displayName: status + description: "Boolean: `true` or `false`" + - name: WorkflowType + displayName: type + description: "The type of condition, currently only `PodRunning`" + +metrics: + - name: CronworkflowsConcurrencypolicyTriggered + description: A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running. + attributes: + - name: CronWFName + - name: CronWFNamespace + - name: ConcurrencyPolicy + unit: "{cronworkflow}" + type: Int64Counter + - name: CronworkflowsTriggeredTotal + description: A counter of the total number of times a CronWorkflow has been triggered + extendedDescription: "Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted." + attributes: + - name: CronWFName + - name: CronWFNamespace + unit: "{cronworkflow}" + type: Int64Counter + - name: DeprecatedFeature + description: "Incidents of deprecated feature being used." + extendedDescription: | + Deprecated features are [explained here](deprecations.md). + 🚨 This counter may go up much more than once for a single use of the feature. 
+ notes: | + `feature` will be one of: + + - [`cronworkflow schedule`](deprecations.md#cronworkflow_schedule) + - [`synchronization mutex`](deprecations.md#synchronization_mutex) + - [`synchronization semaphore`](deprecations.md#synchronization_semaphore) + - [`workflow podpriority`](deprecations.md#workflow_podpriority) + attributes: + - name: DeprecatedFeature + - name: WorkflowNamespace + optional: true + unit: "{feature}" + type: Int64Counter + - name: ErrorCount + description: A counter of certain errors incurred by the controller by cause. + notes: | + The currently tracked specific errors are + + - `OperationPanic` - the controller called `panic()` on encounterin a programming bug + - `CronWorkflowSubmissionError` - A CronWorkflow failed submission + - `CronWorkflowSpecError` - A CronWorkflow has an invalid specification + attributes: + - name: ErrorCause + unit: "{error}" + type: Int64Counter + - name: Gauge + description: A gauge of the number of workflows currently in the cluster in each phase + extendedDescription: | + The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. + A workflow can be stuck in `Running` with pending pods for a long time. + attributes: + - name: WorkflowStatus + unit: "{workflow}" + type: Int64ObservableGauge + - name: IsLeader + description: Emits 1 if leader, 0 otherwise. Always 1 if leader election is disabled + extendedDescription: | + A gauge indicating if this Controller is the [leader](high-availability.md#workflow-controller). + + - `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). + - `0` otherwise, indicating that this controller is a standby that is not currently running workflows. + unit: "{leader}" + type: Int64ObservableGauge + - name: K8sRequestDuration + description: A histogram recording the API requests sent to the Kubernetes API. + notes: This contains all the information contained in `k8s_request_total` along with timings. + attributes: + - name: RequestKind + - name: RequestVerb + - name: RequestCode + unit: "s" + type: Float64Histogram + defaultBuckets: [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0] + - name: K8sRequestTotal + description: A counter of the number of API requests sent to the Kubernetes API. + notes: This metric is calculable from `k8s_request_duration`, and it is suggested you just collect that metric instead. + attributes: + - name: RequestKind + - name: RequestVerb + - name: RequestCode + unit: "{request}" + type: Int64Counter + - name: LogMessages + description: "A count of log messages emitted by the controller by log level: `error`, `warn` and `info`" + attributes: + - name: LogLevel + unit: "{message}" + type: Int64Counter + - name: OperationDurationSeconds + description: A histogram of durations of operations + extendedDescription: | + An operation is a single workflow reconciliation loop within the workflow-controller. + It's the time for the controller to process a single workflow after it has been read from the cluster and is a measure of the performance of the controller affected by the complexity of the workflow. + notes: The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using an `histogramBuckets` modifier in the `metricsConfig` block. + unit: "s" + type: Float64Histogram + - name: PodMissing + description: "Incidents of pod missing." 
+ extendedDescription: | + A counter of pods that were not seen - for example they are by being deleted by Kubernetes. + You should only see this under high load. + attributes: + - name: NodePhase + - name: RecentlyStarted + notes: "`recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds." + unit: "{pod}" + type: Int64Counter + - name: PodPendingCount + description: "Total number of pods that started pending by reason" + attributes: + - name: PodPendingReason + - name: PodNamespace + unit: "{pod}" + type: Int64Counter + - name: PodsGauge + description: A gauge of the number of workflow created pods currently in the cluster in each phase. + extendedDescription: | + It is possible for a workflow to start, but no pods be running (for example cluster is too busy to run them). + This metric sheds light on actual work being done. + attributes: + - name: PodPhase + unit: "{pod}" + type: Int64ObservableGauge + - name: PodsTotalCount + description: "Total number of Pods that have entered each phase" + attributes: + - name: PodPhase + - name: PodNamespace + notes: | + This metric ignores the `PodInitializing` reason and does not count it. + The `reason` attribute is the value from the Reason message before the `:` in the message. + This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. + unit: "{pod}" + type: Int64ObservableGauge + - name: QueueAddsCount + description: A counter of additions to the work queues inside the controller + extendedDescription: The rate of this shows how busy that area of the controller is + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Int64Counter + - name: QueueDepthGauge + description: A gauge of the current depth of the queues. + extendedDescription: If these get large then the workflow controller is not keeping up with the cluster. + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Int64UpDownCounter + - name: QueueDuration + description: A histogram of the time events in the queues are taking to be processed. 
+ notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: s + type: Float64Histogram + defaultBuckets: [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0] + - name: QueueLatency + description: A histogram of the time events in the queues are taking before they are processed. + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: s + type: Float64Histogram + defaultBuckets: [1.0, 5.0, 20.0, 60.0, 180.0] + - name: QueueLongestRunning + description: A gauge of the number of seconds that this queue's longest running processor has been running for. + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: s + type: Float64ObservableGauge + - name: QueueRetries + description: A counter of the number of times a message has been retried in the queue + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Int64Counter + - name: QueueUnfinishedWork + description: A gauge of the number of queue items that have not been processed yet + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: 
Float64ObservableGauge + - name: TotalCount + description: A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace + attributes: + - name: WorkflowPhase + - name: WorkflowNamespace + unit: "{workflow}" + type: Int64Counter + - name: Version + description: "Build metadata for this Controller" + attributes: + - name: BuildVersion + - name: BuildPlatform + - name: BuildGoVersion + - name: BuildDate + - name: BuildCompiler + - name: BuildGitCommit + - name: BuildGitTreeState + - name: BuildGitTag + unit: "{unused}" + type: Int64Counter + - name: WorkersBusyCount + description: A gauge of queue workers that are busy + notes: | + Worker Types: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: WorkerType + unit: "{worker}" + type: Int64UpDownCounter + - name: WorkflowCondition + description: A gauge of the number of workflows with different conditions. + extendedDescription: This will tell you the number of workflows with running pods. + attributes: + - name: WorkflowType + - name: WorkflowStatus + unit: "{workflow}" + type: Int64ObservableGauge + - name: WorkflowtemplateRuntime + description: A histogram of the runtime of workflows using `workflowTemplateRef` only + extendedDescription: | + Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. + Records time between entering the `Running` phase and completion, so does not include any time in `Pending`. + attributes: + - name: TemplateName + - name: TemplateNamespace + - name: TemplateCluster + unit: s + type: Float64Histogram + - name: WorkflowtemplateTriggeredTotal + description: A counter of workflows using `workflowTemplateRef` only, as they enter each phase. + extendedDescription: | + Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. 
+ attributes: + - name: TemplateName + - name: TemplateNamespace + - name: TemplateCluster + unit: "{workflow_template}" + type: Int64Counter diff --git a/util/telemetry/builtinInstrument.go b/util/telemetry/builtinInstrument.go new file mode 100644 index 000000000000..e8b242ae15fe --- /dev/null +++ b/util/telemetry/builtinInstrument.go @@ -0,0 +1,37 @@ +package telemetry + +//import () + +type BuiltinAttribute struct { + name string + optional bool +} + +type BuiltinInstrument struct { + name string + description string + unit string + instType instrumentType + attributes []BuiltinAttribute + defaultBuckets []float64 +} + +func (bi *BuiltinInstrument) Name() string { + return bi.name +} + +// CreateBuiltinInstrument adds a yaml defined builtin instrument +// opts parameter is for legacy metrics, do not use for new metrics +func (m *Metrics) CreateBuiltinInstrument(instrument BuiltinInstrument, opts ...instrumentOption) error { + opts = append(opts, WithAsBuiltIn()) + if len(instrument.defaultBuckets) > 0 { + opts = append(opts, + WithDefaultBuckets(instrument.defaultBuckets)) + } + return m.CreateInstrument(instrument.instType, + instrument.name, + instrument.description, + instrument.unit, + opts..., + ) +} diff --git a/util/telemetry/counter_deprecations.go b/util/telemetry/counter_deprecations.go index 7f4bbec320f2..3fd8c90461a2 100644 --- a/util/telemetry/counter_deprecations.go +++ b/util/telemetry/counter_deprecations.go @@ -4,17 +4,8 @@ import ( "context" ) -const ( - nameDeprecated = `deprecated_feature` -) - func AddDeprecationCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(Int64Counter, - nameDeprecated, - "Incidents of deprecated feature being used.", - "{feature}", - WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(InstrumentDeprecatedFeature) } func (m *Metrics) DeprecatedFeature(ctx context.Context, deprecation string, namespace string) { @@ -24,5 +15,5 @@ func (m *Metrics) DeprecatedFeature(ctx context.Context, deprecation string, nam if namespace != "" { attribs = append(attribs, InstAttrib{Name: AttribWorkflowNamespace, Value: namespace}) } - m.AddInt(ctx, nameDeprecated, 1, attribs) + m.AddInt(ctx, InstrumentDeprecatedFeature.Name(), 1, attribs) } diff --git a/util/telemetry/metrics_list.go b/util/telemetry/metrics_list.go new file mode 100644 index 000000000000..c90eccb616e2 --- /dev/null +++ b/util/telemetry/metrics_list.go @@ -0,0 +1,420 @@ +// Code generated by util/telemetry/builder. DO NOT EDIT. 
+package telemetry + +var InstrumentCronworkflowsConcurrencypolicyTriggered = BuiltinInstrument{ + name: "cronworkflows_concurrencypolicy_triggered", + description: "A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running.", + unit: "{cronworkflow}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribCronWFName, + }, + { + name: AttribCronWFNamespace, + }, + { + name: AttribConcurrencyPolicy, + }, + }, +} + +var InstrumentCronworkflowsTriggeredTotal = BuiltinInstrument{ + name: "cronworkflows_triggered_total", + description: "A counter of the total number of times a CronWorkflow has been triggered", + unit: "{cronworkflow}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribCronWFName, + }, + { + name: AttribCronWFNamespace, + }, + }, +} + +var InstrumentDeprecatedFeature = BuiltinInstrument{ + name: "deprecated_feature", + description: "Incidents of deprecated feature being used.", + unit: "{feature}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribDeprecatedFeature, + }, + { + name: AttribWorkflowNamespace, + optional: true, + }, + }, +} + +var InstrumentErrorCount = BuiltinInstrument{ + name: "error_count", + description: "A counter of certain errors incurred by the controller by cause.", + unit: "{error}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribErrorCause, + }, + }, +} + +var InstrumentGauge = BuiltinInstrument{ + name: "gauge", + description: "A gauge of the number of workflows currently in the cluster in each phase", + unit: "{workflow}", + instType: Int64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribWorkflowStatus, + }, + }, +} + +var InstrumentIsLeader = BuiltinInstrument{ + name: "is_leader", + description: "Emits 1 if leader, 0 otherwise. 
Always 1 if leader election is disabled",
+	unit: "{leader}",
+	instType: Int64ObservableGauge,
+}
+
+var InstrumentK8sRequestDuration = BuiltinInstrument{
+	name: "k8s_request_duration",
+	description: "A histogram recording the duration of API requests sent to the Kubernetes API.",
+	unit: "s",
+	instType: Float64Histogram,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribRequestKind,
+		},
+		{
+			name: AttribRequestVerb,
+		},
+		{
+			name: AttribRequestCode,
+		},
+	},
+	defaultBuckets: []float64{
+		0.100000,
+		0.200000,
+		0.500000,
+		1.000000,
+		2.000000,
+		5.000000,
+		10.000000,
+		20.000000,
+		60.000000,
+		180.000000,
+	},
+}
+
+var InstrumentK8sRequestTotal = BuiltinInstrument{
+	name: "k8s_request_total",
+	description: "A counter of the number of API requests sent to the Kubernetes API.",
+	unit: "{request}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribRequestKind,
+		},
+		{
+			name: AttribRequestVerb,
+		},
+		{
+			name: AttribRequestCode,
+		},
+	},
+}
+
+var InstrumentLogMessages = BuiltinInstrument{
+	name: "log_messages",
+	description: "A count of log messages emitted by the controller by log level: `error`, `warn` and `info`",
+	unit: "{message}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribLogLevel,
+		},
+	},
+}
+
+var InstrumentOperationDurationSeconds = BuiltinInstrument{
+	name: "operation_duration_seconds",
+	description: "A histogram of durations of operations",
+	unit: "s",
+	instType: Float64Histogram,
+}
+
+var InstrumentPodMissing = BuiltinInstrument{
+	name: "pod_missing",
+	description: "Incidents of pod missing.",
+	unit: "{pod}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribNodePhase,
+		},
+		{
+			name: AttribRecentlyStarted,
+		},
+	},
+}
+
+var InstrumentPodPendingCount = BuiltinInstrument{
+	name: "pod_pending_count",
+	description: "Total number of pods that started pending by reason",
+	unit: "{pod}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribPodPendingReason,
+		},
+		{
+			name: AttribPodNamespace,
+		},
+	},
+}
+
+var InstrumentPodsGauge = BuiltinInstrument{
+	name: "pods_gauge",
+	description: "A gauge of the number of workflow-created pods currently in the cluster in each phase.",
+	unit: "{pod}",
+	instType: Int64ObservableGauge,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribPodPhase,
+		},
+	},
+}
+
+var InstrumentPodsTotalCount = BuiltinInstrument{
+	name: "pods_total_count",
+	description: "Total number of Pods that have entered each phase",
+	unit: "{pod}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribPodPhase,
+		},
+		{
+			name: AttribPodNamespace,
+		},
+	},
+}
+
+var InstrumentQueueAddsCount = BuiltinInstrument{
+	name: "queue_adds_count",
+	description: "A counter of additions to the work queues inside the controller",
+	unit: "{item}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+}
+
+var InstrumentQueueDepthGauge = BuiltinInstrument{
+	name: "queue_depth_gauge",
+	description: "A gauge of the current depth of the queues.",
+	unit: "{item}",
+	instType: Int64UpDownCounter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+}
+
+var InstrumentQueueDuration = BuiltinInstrument{
+	name: "queue_duration",
+	description: "A histogram of the time events in the queues are taking to be processed.",
+	unit: "s",
+	instType: Float64Histogram,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+	defaultBuckets: []float64{
+		0.100000,
+		0.200000,
+		0.500000,
+		1.000000,
+		2.000000,
+		5.000000,
+		10.000000,
+		20.000000,
+		60.000000,
+		180.000000,
+	},
+}
+
+var InstrumentQueueLatency = BuiltinInstrument{
+	name: "queue_latency",
+	description: "A histogram of the time events in the queues are taking before they are processed.",
+	unit: "s",
+	instType: Float64Histogram,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+	defaultBuckets: []float64{
+		1.000000,
+		5.000000,
+		20.000000,
+		60.000000,
+		180.000000,
+	},
+}
+
+var InstrumentQueueLongestRunning = BuiltinInstrument{
+	name: "queue_longest_running",
+	description: "A gauge of the number of seconds that this queue's longest running processor has been running for.",
+	unit: "s",
+	instType: Float64ObservableGauge,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+}
+
+var InstrumentQueueRetries = BuiltinInstrument{
+	name: "queue_retries",
+	description: "A counter of the number of times a message has been retried in the queue",
+	unit: "{item}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+}
+
+var InstrumentQueueUnfinishedWork = BuiltinInstrument{
+	name: "queue_unfinished_work",
+	description: "A gauge of the number of seconds of unfinished work currently in the queues",
+	unit: "s",
+	instType: Float64ObservableGauge,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribQueueName,
+		},
+	},
+}
+
+var InstrumentTotalCount = BuiltinInstrument{
+	name: "total_count",
+	description: "A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace",
+	unit: "{workflow}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribWorkflowPhase,
+		},
+		{
+			name: AttribWorkflowNamespace,
+		},
+	},
+}
+
+var InstrumentVersion = BuiltinInstrument{
+	name: "version",
+	description: "Build metadata for this Controller",
+	unit: "{unused}",
+	instType: Int64Counter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribBuildVersion,
+		},
+		{
+			name: AttribBuildPlatform,
+		},
+		{
+			name: AttribBuildGoVersion,
+		},
+		{
+			name: AttribBuildDate,
+		},
+		{
+			name: AttribBuildCompiler,
+		},
+		{
+			name: AttribBuildGitCommit,
+		},
+		{
+			name: AttribBuildGitTreeState,
+		},
+		{
+			name: AttribBuildGitTag,
+		},
+	},
+}
+
+var InstrumentWorkersBusyCount = BuiltinInstrument{
+	name: "workers_busy_count",
+	description: "A gauge of queue workers that are busy",
+	unit: "{worker}",
+	instType: Int64UpDownCounter,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribWorkerType,
+		},
+	},
+}
+
+var InstrumentWorkflowCondition = BuiltinInstrument{
+	name: "workflow_condition",
+	description: "A gauge of the number of workflows with different conditions.",
+	unit: "{workflow}",
+	instType: Int64ObservableGauge,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribWorkflowType,
+		},
+		{
+			name: AttribWorkflowStatus,
+		},
+	},
+}
+
+var InstrumentWorkflowtemplateRuntime = BuiltinInstrument{
+	name: "workflowtemplate_runtime",
+	description: "A histogram of the runtime of workflows using `workflowTemplateRef` only",
+	unit: "s",
+	instType: Float64Histogram,
+	attributes: []BuiltinAttribute{
+		{
+			name: AttribTemplateName,
+		},
+		{
+			name: AttribTemplateNamespace,
+		},
+		{
+			name: AttribTemplateCluster,
+		},
+	},
+}
+
+var InstrumentWorkflowtemplateTriggeredTotal = BuiltinInstrument{
+	name: "workflowtemplate_triggered_total",
+	description: "A counter of workflows using `workflowTemplateRef` only, as 
they enter each phase.", + unit: "{workflow_template}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribTemplateName, + }, + { + name: AttribTemplateNamespace, + }, + { + name: AttribTemplateCluster, + }, + }, +} diff --git a/util/telemetry/version.go b/util/telemetry/version.go index 055aa038bf74..fb951c729dce 100644 --- a/util/telemetry/version.go +++ b/util/telemetry/version.go @@ -7,19 +7,13 @@ import ( ) func AddVersion(ctx context.Context, m *Metrics) error { - const nameVersion = `version` - err := m.CreateInstrument(Int64Counter, - nameVersion, - "Build metadata for this Controller", - "{unused}", - WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(InstrumentVersion) if err != nil { return err } version := argo.GetVersion() - m.AddInt(ctx, nameVersion, 1, InstAttribs{ + m.AddInt(ctx, InstrumentVersion.Name(), 1, InstAttribs{ {Name: AttribBuildVersion, Value: version.Version}, {Name: AttribBuildPlatform, Value: version.Platform}, {Name: AttribBuildGoVersion, Value: version.GoVersion}, diff --git a/workflow/metrics/counter_cronworkflow_policy.go b/workflow/metrics/counter_cronworkflow_policy.go index d77e5172dfeb..bf2aad2f857f 100644 --- a/workflow/metrics/counter_cronworkflow_policy.go +++ b/workflow/metrics/counter_cronworkflow_policy.go @@ -7,23 +7,14 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameCronPolicy = `cronworkflows_concurrencypolicy_triggered` -) - func addCronWfPolicyCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameCronPolicy, - "Total number of times CronWorkflow concurrencyPolicy has triggered", - "{cronworkflow}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentCronworkflowsConcurrencypolicyTriggered) } func (m *Metrics) CronWfPolicy(ctx context.Context, name, namespace string, policy wfv1.ConcurrencyPolicy) { - m.AddInt(ctx, nameCronPolicy, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentCronworkflowsConcurrencypolicyTriggered.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribCronWFName, Value: name}, - {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, + {Name: telemetry.AttribCronWFNamespace, Value: namespace}, {Name: telemetry.AttribConcurrencyPolicy, Value: string(policy)}, }) } diff --git a/workflow/metrics/counter_cronworkflow_trigger.go b/workflow/metrics/counter_cronworkflow_trigger.go index f77c488b6b36..a92333d57aeb 100644 --- a/workflow/metrics/counter_cronworkflow_trigger.go +++ b/workflow/metrics/counter_cronworkflow_trigger.go @@ -6,22 +6,13 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameCronTriggered = `cronworkflows_triggered_total` -) - func addCronWfTriggerCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameCronTriggered, - "Total number of cron workflows triggered", - "{cronworkflow}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentCronworkflowsTriggeredTotal) } func (m *Metrics) CronWfTrigger(ctx context.Context, name, namespace string) { - m.AddInt(ctx, nameCronTriggered, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentCronworkflowsTriggeredTotal.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribCronWFName, Value: name}, - {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, + {Name: telemetry.AttribCronWFNamespace, Value: namespace}, }) } diff --git a/workflow/metrics/counter_error.go 
b/workflow/metrics/counter_error.go index f71f7e6d8315..39a6f1160df5 100644 --- a/workflow/metrics/counter_error.go +++ b/workflow/metrics/counter_error.go @@ -9,37 +9,31 @@ import ( type ErrorCause string const ( - nameErrorCount = `error_count` ErrorCauseOperationPanic ErrorCause = "OperationPanic" ErrorCauseCronWorkflowSubmissionError ErrorCause = "CronWorkflowSubmissionError" ErrorCauseCronWorkflowSpecError ErrorCause = "CronWorkflowSpecError" ) func addErrorCounter(ctx context.Context, m *Metrics) error { - err := m.CreateInstrument(telemetry.Int64Counter, - nameErrorCount, - "Number of errors encountered by the controller by cause", - "{error}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentErrorCount) if err != nil { return err } // Initialise all values to zero for _, cause := range []ErrorCause{ErrorCauseOperationPanic, ErrorCauseCronWorkflowSubmissionError, ErrorCauseCronWorkflowSpecError} { - m.AddInt(ctx, nameErrorCount, 0, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(cause)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 0, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(cause)}}) } return nil } func (m *Metrics) OperationPanic(ctx context.Context) { - m.AddInt(ctx, nameErrorCount, 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseOperationPanic)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseOperationPanic)}}) } func (m *Metrics) CronWorkflowSubmissionError(ctx context.Context) { - m.AddInt(ctx, nameErrorCount, 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSubmissionError)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSubmissionError)}}) } func (m *Metrics) CronWorkflowSpecError(ctx context.Context) { - m.AddInt(ctx, nameErrorCount, 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSpecError)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSpecError)}}) } diff --git a/workflow/metrics/counter_log.go b/workflow/metrics/counter_log.go index b9cea55952ab..dd3a37061dba 100644 --- a/workflow/metrics/counter_log.go +++ b/workflow/metrics/counter_log.go @@ -13,19 +13,14 @@ type logMetric struct { } func addLogCounter(ctx context.Context, m *Metrics) error { - const nameLogMessages = `log_messages` - err := m.CreateInstrument(telemetry.Int64Counter, - nameLogMessages, - "Total number of log messages.", - "{message}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentLogMessages) + name := telemetry.InstrumentLogMessages.Name() lm := logMetric{ - counter: m.AllInstruments[nameLogMessages], + counter: m.AllInstruments[name], } log.AddHook(lm) for _, level := range lm.Levels() { - m.AddInt(ctx, nameLogMessages, 0, telemetry.InstAttribs{ + m.AddInt(ctx, name, 0, telemetry.InstAttribs{ {Name: telemetry.AttribLogLevel, Value: level.String()}, }) } diff --git a/workflow/metrics/counter_pod_missing.go b/workflow/metrics/counter_pod_missing.go index 8f3b227d8ef0..6b51cd2ef98d 100644 --- a/workflow/metrics/counter_pod_missing.go +++ b/workflow/metrics/counter_pod_missing.go @@ -6,21 +6,12 @@ import ( 
"github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - namePodMissing = `pod_missing` -) - func addPodMissingCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - namePodMissing, - "Incidents of pod missing.", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentPodMissing) } func (m *Metrics) incPodMissing(ctx context.Context, val int64, recentlyStarted bool, phase string) { - m.AddInt(ctx, namePodMissing, val, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentPodMissing.Name(), val, telemetry.InstAttribs{ {Name: telemetry.AttribRecentlyStarted, Value: recentlyStarted}, {Name: telemetry.AttribNodePhase, Value: phase}, }) diff --git a/workflow/metrics/counter_pod_pending.go b/workflow/metrics/counter_pod_pending.go index 5138df86abea..d9f485b28e32 100644 --- a/workflow/metrics/counter_pod_pending.go +++ b/workflow/metrics/counter_pod_pending.go @@ -7,17 +7,8 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - namePodPending = `pod_pending_count` -) - func addPodPendingCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - namePodPending, - "Total number of pods that started pending by reason", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentPodPendingCount) } func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string) { @@ -30,7 +21,7 @@ func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string // the pod_phase metric can cope with this being visible return default: - m.AddInt(ctx, namePodPending, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentPodPendingCount.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribPodPendingReason, Value: splitReason[0]}, {Name: telemetry.AttribPodNamespace, Value: namespace}, }) diff --git a/workflow/metrics/counter_pod_phase.go b/workflow/metrics/counter_pod_phase.go index 37686c5fc4c3..1b644d389ae2 100644 --- a/workflow/metrics/counter_pod_phase.go +++ b/workflow/metrics/counter_pod_phase.go @@ -6,21 +6,12 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - namePodPhase = `pods_total_count` -) - func addPodPhaseCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - namePodPhase, - "Total number of Pods that have entered each phase", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentPodsTotalCount) } func (m *Metrics) ChangePodPhase(ctx context.Context, phase, namespace string) { - m.AddInt(ctx, namePodPhase, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentPodsTotalCount.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribPodPhase, Value: phase}, {Name: telemetry.AttribPodNamespace, Value: namespace}, }) diff --git a/workflow/metrics/counter_template.go b/workflow/metrics/counter_template.go index f1cb3500cab3..f7f8a6a8b37e 100644 --- a/workflow/metrics/counter_template.go +++ b/workflow/metrics/counter_template.go @@ -6,17 +6,8 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameWFTemplateTriggered = `workflowtemplate_triggered_total` -) - func addWorkflowTemplateCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameWFTemplateTriggered, - "Total number of workflow templates triggered by workflowTemplateRef", - "{workflow_template}", - 
telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentWorkflowtemplateTriggeredTotal) } func templateAttribs(name, namespace string, cluster bool) telemetry.InstAttribs { @@ -31,5 +22,5 @@ func (m *Metrics) CountWorkflowTemplate(ctx context.Context, phase MetricWorkflo attribs := templateAttribs(name, namespace, cluster) attribs = append(attribs, telemetry.InstAttrib{Name: telemetry.AttribWorkflowPhase, Value: string(phase)}) - m.AddInt(ctx, nameWFTemplateTriggered, 1, attribs) + m.AddInt(ctx, telemetry.InstrumentWorkflowtemplateTriggeredTotal.Name(), 1, attribs) } diff --git a/workflow/metrics/counter_workflow_phase.go b/workflow/metrics/counter_workflow_phase.go index 91c56eef4853..b95439aaa0c3 100644 --- a/workflow/metrics/counter_workflow_phase.go +++ b/workflow/metrics/counter_workflow_phase.go @@ -7,10 +7,6 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameWorkflowPhaseCounter = `total_count` -) - type MetricWorkflowPhase string const ( @@ -41,16 +37,11 @@ func ConvertWorkflowPhase(inPhase wfv1.WorkflowPhase) MetricWorkflowPhase { } func addWorkflowPhaseCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameWorkflowPhaseCounter, - "Total number of workflows that have entered each phase", - "{workflow}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentTotalCount) } func (m *Metrics) ChangeWorkflowPhase(ctx context.Context, phase MetricWorkflowPhase, namespace string) { - m.AddInt(ctx, nameWorkflowPhaseCounter, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentTotalCount.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribWorkflowPhase, Value: string(phase)}, {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, }) diff --git a/workflow/metrics/gauge_pod_phase.go b/workflow/metrics/gauge_pod_phase.go index b93e9c33729b..ac401ca20ccf 100644 --- a/workflow/metrics/gauge_pod_phase.go +++ b/workflow/metrics/gauge_pod_phase.go @@ -17,23 +17,18 @@ type podPhaseGauge struct { } func addPodPhaseGauge(ctx context.Context, m *Metrics) error { - const namePodsPhase = `pods_gauge` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - namePodsPhase, - "Number of Pods from Workflows currently accessible by the controller by status.", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentPodsGauge) if err != nil { return err } + name := telemetry.InstrumentPodsGauge.Name() if m.callbacks.PodPhase != nil { ppGauge := podPhaseGauge{ callback: m.callbacks.PodPhase, - gauge: m.AllInstruments[namePodsPhase], + gauge: m.AllInstruments[name], } - return m.AllInstruments[namePodsPhase].RegisterCallback(m.Metrics, ppGauge.update) + return m.AllInstruments[name].RegisterCallback(m.Metrics, ppGauge.update) } return nil } diff --git a/workflow/metrics/gauge_workflow_condition.go b/workflow/metrics/gauge_workflow_condition.go index 3709cd07d9d1..0c174f6456a0 100644 --- a/workflow/metrics/gauge_workflow_condition.go +++ b/workflow/metrics/gauge_workflow_condition.go @@ -18,13 +18,7 @@ type workflowConditionGauge struct { } func addWorkflowConditionGauge(_ context.Context, m *Metrics) error { - const nameWorkflowCondition = `workflow_condition` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - nameWorkflowCondition, - "Workflow condition.", - "{unit}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentWorkflowCondition) if err != nil { 
return err } @@ -32,9 +26,9 @@ func addWorkflowConditionGauge(_ context.Context, m *Metrics) error { if m.callbacks.WorkflowCondition != nil { wfcGauge := workflowConditionGauge{ callback: m.callbacks.WorkflowCondition, - gauge: m.AllInstruments[nameWorkflowCondition], + gauge: m.AllInstruments[telemetry.InstrumentWorkflowCondition.Name()], } - return m.AllInstruments[nameWorkflowCondition].RegisterCallback(m.Metrics, wfcGauge.update) + return m.AllInstruments[telemetry.InstrumentWorkflowCondition.Name()].RegisterCallback(m.Metrics, wfcGauge.update) } return nil // TODO init all phases? diff --git a/workflow/metrics/gauge_workflow_phase.go b/workflow/metrics/gauge_workflow_phase.go index 59a6e670e413..997d7088f084 100644 --- a/workflow/metrics/gauge_workflow_phase.go +++ b/workflow/metrics/gauge_workflow_phase.go @@ -17,23 +17,18 @@ type workflowPhaseGauge struct { } func addWorkflowPhaseGauge(_ context.Context, m *Metrics) error { - const nameWorkflowPhaseGauge = `gauge` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - nameWorkflowPhaseGauge, - "number of Workflows currently accessible by the controller by status", - "{workflow}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentGauge) if err != nil { return err } + name := telemetry.InstrumentGauge.Name() if m.callbacks.WorkflowPhase != nil { wfpGauge := workflowPhaseGauge{ callback: m.callbacks.WorkflowPhase, - gauge: m.AllInstruments[nameWorkflowPhaseGauge], + gauge: m.AllInstruments[name], } - return m.AllInstruments[nameWorkflowPhaseGauge].RegisterCallback(m.Metrics, wfpGauge.update) + return m.AllInstruments[name].RegisterCallback(m.Metrics, wfpGauge.update) } return nil // TODO init all phases? diff --git a/workflow/metrics/histogram_durations.go b/workflow/metrics/histogram_durations.go index 6b6038e31ff2..43e8979d4761 100644 --- a/workflow/metrics/histogram_durations.go +++ b/workflow/metrics/histogram_durations.go @@ -13,7 +13,6 @@ import ( ) const ( - nameOperationDuration = `operation_duration_seconds` operationDurationDefaultBucketCount = 6 ) @@ -26,15 +25,10 @@ func addOperationDurationHistogram(_ context.Context, m *Metrics) error { } bucketWidth := maxOperationTimeSeconds / float64(operationDurationMetricBucketCount) // The buckets here are only the 'defaults' and can be overridden with configmap defaults - return m.CreateInstrument(telemetry.Float64Histogram, - nameOperationDuration, - "Histogram of durations of operations", - "s", - telemetry.WithDefaultBuckets(prometheus.LinearBuckets(bucketWidth, bucketWidth, operationDurationMetricBucketCount)), - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentOperationDurationSeconds, + telemetry.WithDefaultBuckets(prometheus.LinearBuckets(bucketWidth, bucketWidth, operationDurationMetricBucketCount))) } func (m *Metrics) OperationCompleted(ctx context.Context, durationSeconds float64) { - m.Record(ctx, nameOperationDuration, durationSeconds, telemetry.InstAttribs{}) + m.Record(ctx, telemetry.InstrumentOperationDurationSeconds.Name(), durationSeconds, telemetry.InstAttribs{}) } diff --git a/workflow/metrics/histogram_template.go b/workflow/metrics/histogram_template.go index a466b9112fa3..bb87145a71cf 100644 --- a/workflow/metrics/histogram_template.go +++ b/workflow/metrics/histogram_template.go @@ -7,19 +7,10 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameWorkflowTemplateRuntime = `workflowtemplate_runtime` -) - func addWorkflowTemplateHistogram(_ 
context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Float64Histogram, - nameWorkflowTemplateRuntime, - "Duration of workflow template runs run through workflowTemplateRefs", - "s", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentWorkflowtemplateRuntime) } func (m *Metrics) RecordWorkflowTemplateTime(ctx context.Context, duration time.Duration, name, namespace string, cluster bool) { - m.Record(ctx, nameWorkflowTemplateRuntime, duration.Seconds(), templateAttribs(name, namespace, cluster)) + m.Record(ctx, telemetry.InstrumentWorkflowtemplateRuntime.Name(), duration.Seconds(), templateAttribs(name, namespace, cluster)) } diff --git a/workflow/metrics/leader.go b/workflow/metrics/leader.go index ff3562b66b52..e91dc6c9c8a9 100644 --- a/workflow/metrics/leader.go +++ b/workflow/metrics/leader.go @@ -16,24 +16,19 @@ type leaderGauge struct { } func addIsLeader(ctx context.Context, m *Metrics) error { - const nameLeader = `is_leader` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - nameLeader, - "Emits 1 if leader, 0 otherwise. Always 1 if leader election is disabled.", - "{leader}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentIsLeader) if err != nil { return err } if m.callbacks.IsLeader == nil { return nil } + name := telemetry.InstrumentIsLeader.Name() lGauge := leaderGauge{ callback: m.callbacks.IsLeader, - gauge: m.AllInstruments[nameLeader], + gauge: m.AllInstruments[name], } - return m.AllInstruments[nameLeader].RegisterCallback(m.Metrics, lGauge.update) + return m.AllInstruments[name].RegisterCallback(m.Metrics, lGauge.update) } func (l *leaderGauge) update(_ context.Context, o metric.Observer) error { diff --git a/workflow/metrics/leader_test.go b/workflow/metrics/leader_test.go index 76514ab4fa51..0d7c59c7c32e 100644 --- a/workflow/metrics/leader_test.go +++ b/workflow/metrics/leader_test.go @@ -22,7 +22,7 @@ func TestIsLeader(t *testing.T) { require.NoError(t, err) assert.NotNil(t, te) attribs := attribute.NewSet() - val, err := te.GetInt64GaugeValue(`is_leader`, &attribs) + val, err := te.GetInt64GaugeValue(telemetry.InstrumentIsLeader.Name(), &attribs) require.NoError(t, err) assert.Equal(t, int64(1), val) } @@ -38,7 +38,7 @@ func TestNotLeader(t *testing.T) { require.NoError(t, err) assert.NotNil(t, te) attribs := attribute.NewSet() - val, err := te.GetInt64GaugeValue(`is_leader`, &attribs) + val, err := te.GetInt64GaugeValue(telemetry.InstrumentIsLeader.Name(), &attribs) require.NoError(t, err) assert.Equal(t, int64(0), val) } diff --git a/workflow/metrics/metrics_k8s_request.go b/workflow/metrics/metrics_k8s_request.go index 3294260823aa..29bdf769793f 100644 --- a/workflow/metrics/metrics_k8s_request.go +++ b/workflow/metrics/metrics_k8s_request.go @@ -11,28 +11,12 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameK8sRequestTotal = `k8s_request_total` - nameK8sRequestDuration = `k8s_request_duration` -) - func addK8sRequests(_ context.Context, m *Metrics) error { - err := m.CreateInstrument(telemetry.Int64Counter, - nameK8sRequestTotal, - "Number of kubernetes requests executed.", - "{request}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentK8sRequestTotal) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64Histogram, - nameK8sRequestDuration, - "Duration of kubernetes requests executed.", - "s", - telemetry.WithDefaultBuckets([]float64{0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 
10.0, 20.0, 60.0, 180.0}), - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentK8sRequestDuration) // Register this metrics with the global k8sMetrics.metrics = m return err @@ -63,8 +47,8 @@ func (m metricsRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) {Name: telemetry.AttribRequestVerb, Value: verb}, {Name: telemetry.AttribRequestCode, Value: x.StatusCode}, } - (*m.metrics).AddInt(m.ctx, nameK8sRequestTotal, 1, attribs) - (*m.metrics).Record(m.ctx, nameK8sRequestDuration, duration.Seconds(), attribs) + (*m.metrics).AddInt(m.ctx, telemetry.InstrumentK8sRequestTotal.Name(), 1, attribs) + (*m.metrics).Record(m.ctx, telemetry.InstrumentK8sRequestDuration.Name(), duration.Seconds(), attribs) } return x, err } diff --git a/workflow/metrics/metrics_test.go b/workflow/metrics/metrics_test.go index c8fbda52ab6b..a5e789579db8 100644 --- a/workflow/metrics/metrics_test.go +++ b/workflow/metrics/metrics_test.go @@ -22,7 +22,7 @@ func TestMetrics(t *testing.T) { m.OperationCompleted(m.Ctx, 5) assert.NotNil(t, te) attribs := attribute.NewSet() - val, err := te.GetFloat64HistogramData(nameOperationDuration, &attribs) + val, err := te.GetFloat64HistogramData(telemetry.InstrumentOperationDurationSeconds.Name(), &attribs) require.NoError(t, err) assert.Equal(t, []float64{5, 10, 15, 20, 25, 30}, val.Bounds) assert.Equal(t, []uint64{1, 0, 0, 0, 0, 0, 0}, val.BucketCounts) diff --git a/workflow/metrics/work_queue.go b/workflow/metrics/work_queue.go index 537643e719cc..0feb6f677181 100644 --- a/workflow/metrics/work_queue.go +++ b/workflow/metrics/work_queue.go @@ -11,17 +11,6 @@ import ( "k8s.io/utils/ptr" ) -const ( - nameWorkersBusy = `workers_busy_count` - nameWorkersQueueDepth = `queue_depth_gauge` - nameWorkersQueueAdds = `queue_adds_count` - nameWorkersQueueLatency = `queue_latency` - nameWorkersQueueDuration = `queue_duration` - nameWorkersRetries = `queue_retries` - nameWorkersUnfinishedWork = `queue_unfinished_work` - nameWorkersLongestRunning = `queue_longest_running` -) - // Act as a metrics provider for a workqueues var _ workqueue.MetricsProvider = &Metrics{} @@ -34,94 +23,51 @@ type workersBusyRateLimiterWorkQueue struct { } func addWorkQueueMetrics(_ context.Context, m *Metrics) error { - err := m.CreateInstrument(telemetry.Int64UpDownCounter, - nameWorkersBusy, - "Number of workers currently busy", - "{worker}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentWorkersBusyCount) if err != nil { return err } - err = m.CreateInstrument(telemetry.Int64UpDownCounter, - nameWorkersQueueDepth, - "Depth of the queue", - "{item}", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueDepthGauge) if err != nil { return err } - err = m.CreateInstrument(telemetry.Int64Counter, - nameWorkersQueueAdds, - "Adds to the queue", - "{item}", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueAddsCount) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64Histogram, - nameWorkersQueueLatency, - "Time objects spend waiting in the queue", - "s", - telemetry.WithDefaultBuckets([]float64{1.0, 5.0, 20.0, 60.0, 180.0}), - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueLatency) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64Histogram, - nameWorkersQueueDuration, - "Time objects spend being processed from the queue", - "s", - 
telemetry.WithDefaultBuckets([]float64{0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0}), - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueDuration) if err != nil { return err } - err = m.CreateInstrument(telemetry.Int64Counter, - nameWorkersRetries, - "Retries in the queues", - "{item}", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueRetries) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64ObservableGauge, - nameWorkersUnfinishedWork, - "Unfinished work time", - "s", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueUnfinishedWork) if err != nil { return err } unfinishedCallback := queueUserdata{ - gauge: m.AllInstruments[nameWorkersUnfinishedWork], - } - m.AllInstruments[nameWorkersUnfinishedWork].SetUserdata(&unfinishedCallback) - err = m.AllInstruments[nameWorkersUnfinishedWork].RegisterCallback(m.Metrics, unfinishedCallback.update) + gauge: m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()]} + m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()].SetUserdata(&unfinishedCallback) + err = m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()].RegisterCallback(m.Metrics, unfinishedCallback.update) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64ObservableGauge, - nameWorkersLongestRunning, - "Longest running worker", - "s", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueLongestRunning) if err != nil { return err } longestRunningCallback := queueUserdata{ - gauge: m.AllInstruments[nameWorkersLongestRunning], + gauge: m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()], } - m.AllInstruments[nameWorkersLongestRunning].SetUserdata(&longestRunningCallback) - err = m.AllInstruments[nameWorkersLongestRunning].RegisterCallback(m.Metrics, longestRunningCallback.update) + m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()].SetUserdata(&longestRunningCallback) + err = m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()].RegisterCallback(m.Metrics, longestRunningCallback.update) if err != nil { return err } @@ -132,7 +78,7 @@ func (m *Metrics) RateLimiterWithBusyWorkers(ctx context.Context, workQueue work queue := workersBusyRateLimiterWorkQueue{ RateLimitingInterface: workqueue.NewNamedRateLimitingQueue(workQueue, queueName), workerType: queueName, - busyGauge: m.AllInstruments[nameWorkersBusy], + busyGauge: m.AllInstruments[telemetry.InstrumentWorkersBusyCount.Name()], ctx: ctx, } queue.newWorker(ctx) @@ -221,7 +167,7 @@ func (m *Metrics) NewDepthMetric(name string) workqueue.GaugeMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueDepth], + inst: m.AllInstruments[telemetry.InstrumentQueueDepthGauge.Name()], } } @@ -229,7 +175,7 @@ func (m *Metrics) NewAddsMetric(name string) workqueue.CounterMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueAdds], + inst: m.AllInstruments[telemetry.InstrumentQueueAddsCount.Name()], } } @@ -237,7 +183,7 @@ func (m *Metrics) NewLatencyMetric(name string) workqueue.HistogramMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueLatency], + inst: m.AllInstruments[telemetry.InstrumentQueueLatency.Name()], } } @@ -245,7 +191,7 @@ func (m *Metrics) NewWorkDurationMetric(name string) workqueue.HistogramMetric { return queueMetric{ ctx: m.Ctx, name: 
name, - inst: m.AllInstruments[nameWorkersQueueDuration], + inst: m.AllInstruments[telemetry.InstrumentQueueDuration.Name()], } } @@ -253,7 +199,7 @@ func (m *Metrics) NewRetriesMetric(name string) workqueue.CounterMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersRetries], + inst: m.AllInstruments[telemetry.InstrumentQueueRetries.Name()], } } @@ -261,7 +207,7 @@ func (m *Metrics) NewUnfinishedWorkSecondsMetric(name string) workqueue.Settable metric := queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersUnfinishedWork], + inst: m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()], value: ptr.To(float64(0.0)), } ud := getQueueUserdata(metric.inst) @@ -273,7 +219,7 @@ func (m *Metrics) NewLongestRunningProcessorSecondsMetric(name string) workqueue metric := queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersLongestRunning], + inst: m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()], value: ptr.To(float64(0.0)), } ud := getQueueUserdata(metric.inst)
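
For review context, here is a minimal end-to-end sketch of the flow this patch converges on. Everything named "Example" below is invented purely for illustration (the hypothetical `example_total` metric, its attribute set, and the two wrapper functions); only `BuiltinInstrument`, `BuiltinAttribute`, `CreateBuiltinInstrument`, `InstAttribs`, `AddInt` and `Name()` are the real helpers introduced above. A builder entry in util/telemetry/builder/values.yaml would generate a value like this into metrics_list.go, and a per-metric file would register and record it:

package telemetry

import "context"

// Hypothetical instrument as the builder would emit it into
// metrics_list.go (fields are unexported, so this lives in package telemetry).
var InstrumentExampleTotal = BuiltinInstrument{
	name:        "example_total",
	description: "A counter of example events",
	unit:        "{event}",
	instType:    Int64Counter,
	attributes: []BuiltinAttribute{
		{name: AttribWorkflowNamespace, optional: true},
	},
}

// Registration: the type, name, description, unit and any default
// buckets all come from the generated instrument, so the hand-written
// name constants deleted throughout this patch are no longer needed.
func addExampleCounter(_ context.Context, m *Metrics) error {
	return m.CreateBuiltinInstrument(InstrumentExampleTotal)
}

// Recording: call sites look the instrument up via Name(), and attach
// the optional namespace attribute only when it is known, mirroring
// DeprecatedFeature above.
func (m *Metrics) countExample(ctx context.Context, namespace string) {
	attribs := InstAttribs{}
	if namespace != "" {
		attribs = append(attribs, InstAttrib{Name: AttribWorkflowNamespace, Value: namespace})
	}
	m.AddInt(ctx, InstrumentExampleTotal.Name(), 1, attribs)
}

Because the definitions live in one generated list, the per-metric files in workflow/metrics shrink to just this registration and recording logic, which is exactly the shape of the conversions in this patch.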