diff --git a/Makefile b/Makefile index c51b9e3d77e7..ba67f3323078 100644 --- a/Makefile +++ b/Makefile @@ -460,7 +460,7 @@ lint: ui/dist/app/index.html $(GOPATH)/bin/golangci-lint # for local we have a faster target that prints to stdout, does not use json, and can cache because it has no coverage .PHONY: test -test: ui/dist/app/index.html +test: ui/dist/app/index.html util/telemetry/metrics_list.go util/telemetry/attributes.go go build ./... env KUBECONFIG=/dev/null $(GOTEST) ./... # marker file, based on it's modification time, we know how long ago this target was run @@ -621,8 +621,21 @@ clean: go clean rm -Rf test-results node_modules vendor v2 v3 argoexec-linux-amd64 dist/* ui/dist -# swagger +# Build telemetry files +TELEMETRY_BUILDER := $(shell find util/telemetry/builder -type f -name '*.go') +docs/metrics.md: $(TELEMETRY_BUILDER) util/telemetry/builder/values.yaml + @echo Rebuilding $@ + go run ./util/telemetry/builder --metricsDocs $@ + +util/telemetry/metrics_list.go: $(TELEMETRY_BUILDER) util/telemetry/builder/values.yaml + @echo Rebuilding $@ + go run ./util/telemetry/builder --metricsListGo $@ +util/telemetry/attributes.go: $(TELEMETRY_BUILDER) util/telemetry/builder/values.yaml + @echo Rebuilding $@ + go run ./util/telemetry/builder --attributesGo $@ + +# swagger pkg/apis/workflow/v1alpha1/openapi_generated.go: $(GOPATH)/bin/openapi-gen $(TYPES) # These files are generated on a v3/ folder by the tool. Link them to the root folder [ -e ./v3 ] || ln -s . 
v3 @@ -707,7 +720,7 @@ ifneq ($(USE_NIX), true) endif .PHONY: docs-spellcheck -docs-spellcheck: /usr/local/bin/mdspell +docs-spellcheck: /usr/local/bin/mdspell docs/metrics.md # check docs for spelling mistakes mdspell --ignore-numbers --ignore-acronyms --en-us --no-suggestions --report $(shell find docs -name '*.md' -not -name upgrading.md -not -name README.md -not -name fields.md -not -name upgrading.md -not -name executor_swagger.md -not -path '*/cli/*') # alphabetize spelling file -- ignore first line (comment), then sort the rest case-sensitive and remove duplicates @@ -732,7 +745,7 @@ endif .PHONY: docs-lint -docs-lint: /usr/local/bin/markdownlint +docs-lint: /usr/local/bin/markdownlint docs/metrics.md # lint docs markdownlint docs --fix --ignore docs/fields.md --ignore docs/executor_swagger.md --ignore docs/cli --ignore docs/walk-through/the-structure-of-workflow-specs.md diff --git a/docs/metrics.md b/docs/metrics.md index a1cd4adb586f..fd3d0b000733 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -242,38 +242,35 @@ Metrics for the [Four Golden Signals](https://sre.google/sre-book/monitoring-dis !!! Warning "High cardinality" Some metric attributes may have high cardinality and are marked with ⚠️ to warn you. You may need to disable this metric or disable the attribute. - - + #### `cronworkflows_concurrencypolicy_triggered` A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running. 
- -| attribute | explanation | -|-------------|-------------------------------------------| -| `name` | ⚠️ The name of the CronWorkflow | -| `namespace` | The namespace of the CronWorkflow | +| attribute | explanation | +|----------------------|----------------------------------------------------------------------------------| +| `name` | ⚠️ The name of the CronWorkflow | +| `namespace` | The namespace that the CronWorkflow is in | | `concurrency_policy` | The concurrency policy which was triggered, will be either `Forbid` or `Replace` | #### `cronworkflows_triggered_total` -A counter of the number of times a CronWorkflow has been triggered. +A counter of the total number of times a CronWorkflow has been triggered. Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted. - -| attribute | explanation | +| attribute | explanation | |-------------|-------------------------------------------| -| `name` | ⚠️ The name of the CronWorkflow | -| `namespace` | The namespace of the CronWorkflow | +| `name` | ⚠️ The name of the CronWorkflow | +| `namespace` | The namespace that the CronWorkflow is in | #### `deprecated_feature` -A counter which goes up when a feature which is [deprecated](deprecations.md) is used. +Incidents of deprecated feature being used. +Deprecated features are [explained here](deprecations.md). 🚨 This counter may go up much more than once for a single use of the feature. 
- -| attribute | explanation | -|-------------|---------------------------------------------| -| `feature` | The name of the feature used | -| `namespace` | The namespace of the item using the feature | +| attribute | explanation | +|-------------|---------------------------------------| +| `feature` | The name of the feature used | +| `namespace` | The namespace that the Workflow is in | `feature` will be one of: @@ -282,129 +279,123 @@ A counter which goes up when a feature which is [deprecated](deprecations.md) is - [`synchronization semaphore`](deprecations.md#synchronization_semaphore) - [`workflow podpriority`](deprecations.md#workflow_podpriority) -#### `gauge` - -A gauge of the number of workflows currently in the cluster in each phase. The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. A workflow can be stuck in `Running` with pending pods for a long time. - -| attribute | explanation | -|-----------|-----------------------------------| -| `status` | The phase that the workflow is in | - #### `error_count` -A counter of certain errors incurred by the controller. - -| attribute | explanation | +A counter of certain errors incurred by the controller by cause. +| attribute | explanation | |-----------|------------------------| -| `cause` | The cause of the error | +| `cause` | The cause of the error | The currently tracked specific errors are -- `OperationPanic` - the controller `panic()` on a programming bug +- `OperationPanic` - the controller called `panic()` on encountering a programming bug - `CronWorkflowSubmissionError` - A CronWorkflow failed submission - `CronWorkflowSpecError` - A CronWorkflow has an invalid specification -#### `k8s_request_total` +#### `gauge` -A counter of the number of API requests sent to the Kubernetes API. +A gauge of the number of workflows currently in the cluster in each phase. 
+The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. +A workflow can be stuck in `Running` with pending pods for a long time. +| attribute | explanation | +|-----------|----------------------------| +| `status` | Boolean: `true` or `false` | -| attribute | explanation | -|---------------|--------------------------------------------------------------------| -| `kind` | The kubernetes `kind` involved in the request such as `configmaps` | -| `verb` | The verb of the request, such as `Get` or `List` | -| `status_code` | The HTTP status code of the response | +#### `is_leader` -This metric is calculable from `k8s_request_duration`, and it is suggested you just collect that metric instead. +Emits 1 if leader, 0 otherwise. Always 1 if leader election is disabled. +A gauge indicating if this Controller is the [leader](high-availability.md#workflow-controller). -#### `k8s_request_duration` +- `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). +- `0` otherwise, indicating that this controller is a standby that is not currently running workflows. +This metric has no attributes. -A histogram recording how long each type of request took. +#### `k8s_request_duration` -| attribute | explanation | +A histogram recording the API requests sent to the Kubernetes API. +| attribute | explanation | |---------------|--------------------------------------------------------------------| | `kind` | The kubernetes `kind` involved in the request such as `configmaps` | | `verb` | The verb of the request, such as `Get` or `List` | | `status_code` | The HTTP status code of the response | -This is contains all the information contained in `k8s_request_total` along with timings. +Default bucket sizes: 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 60, 180 +This contains all the information contained in `k8s_request_total` along with timings. 
-#### `is_leader` +#### `k8s_request_total` -A gauge indicating if this Controller is the [leader](high-availability.md#workflow-controller). +A counter of the number of API requests sent to the Kubernetes API. +| attribute | explanation | +|---------------|--------------------------------------------------------------------| +| `kind` | The kubernetes `kind` involved in the request such as `configmaps` | +| `verb` | The verb of the request, such as `Get` or `List` | +| `status_code` | The HTTP status code of the response | -- `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). -- `0` otherwise, indicating that this controller is a standby that is not currently running workflows. +This metric is calculable from `k8s_request_duration`, and it is suggested you just collect that metric instead. #### `log_messages` A count of log messages emitted by the controller by log level: `error`, `warn` and `info`. - -| attribute | explanation | +| attribute | explanation | |-----------|------------------------------| | `level` | The log level of the message | #### `operation_duration_seconds` -A histogram of durations of operations. An operation is a single workflow reconciliation loop within the workflow-controller. +A histogram of durations of operations. +An operation is a single workflow reconciliation loop within the workflow-controller. It's the time for the controller to process a single workflow after it has been read from the cluster and is a measure of the performance of the controller affected by the complexity of the workflow. - This metric has no attributes. - The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using an `histogramBuckets` modifier in the `metricsConfig` block. -#### `pods_gauge` - -A gauge of the number of workflow created pods currently in the cluster in each phase. 
-It is possible for a workflow to start, but no pods be running (for example cluster is too busy to run them). -This metric sheds light on actual work being done. - -| attribute | explanation | -|-----------|------------------------------| -| `phase` | The phase that the pod is in | - #### `pod_missing` +Incidents of pod missing. A counter of pods that were not seen - for example they are by being deleted by Kubernetes. You should only see this under high load. - -| attribute | explanation | +| attribute | explanation | |--------------------|----------------------------------------| -| `recently_started` | Boolean: was this pod started recently | | `node_phase` | The phase that the pod's node was in | +| `recently_started` | Boolean: was this pod started recently | `recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds. #### `pod_pending_count` -A counter of pods that have been seen in the Pending state. +Total number of pods that started pending by reason. +| attribute | explanation | +|-------------|----------------------------------------------| +| `reason` | Summary of the kubernetes Reason for pending | +| `namespace` | The namespace that the pod is in | -| attribute | explanation | -|--------------------|-------------------------------------------| -| `reason` | Summary of the kubernetes Reason for pending. | -| `namespace` | The namespace in which the pod is running | +#### `pods_gauge` -This metric ignores the `PodInitializing` reason and does not count it. -The `reason` attribute is the value from the Reason message before the `:` in the message. -This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. +A gauge of the number of workflow created pods currently in the cluster in each phase. +It is possible for a workflow to start, but no pods be running (for example cluster is too busy to run them). 
+This metric sheds light on actual work being done. +| attribute | explanation | +|-----------|------------------------------| +| `phase` | The phase that the pod is in | #### `pods_total_count` -A gauge of the number of pods which have entered each phase and then observed by the controller. -This is not directly controlled by the workflow controller, so it is possible for some pod phases to be missed. +Total number of pods that have entered each phase. +| attribute | explanation | +|-------------|----------------------------------| +| `phase` | The phase that the pod is in | +| `namespace` | The namespace that the pod is in | -| attribute | explanation | -|-------------|-------------------------------------------| -| `phase` | The phase that the pod is in | -| `namespace` | The namespace in which the pod is running | +This metric ignores the `PodInitializing` reason and does not count it. +The `reason` attribute is the value from the Reason message before the `:` in the message. +This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. #### `queue_adds_count` A counter of additions to the work queues inside the controller. -The rate of this shows how busy that area of the controller is. - -| attribute | explanation | -|---------------|-------------------| -| `worker_type` | The type of queue | +The rate of this shows how busy that area of the controller is +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | Queues: @@ -420,102 +411,176 @@ This and associated metrics are all directly sourced from the [client-go workque A gauge of the current depth of the queues. If these get large then the workflow controller is not keeping up with the cluster. +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | -See [queue adds count](#queue_adds_count) for details. 
+Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_duration` A histogram of the time events in the queues are taking to be processed. +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | -See [queue adds count](#queue_adds_count) for details. +Default bucket sizes: 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 60, 180 +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_latency` A histogram of the time events in the queues are taking before they are processed. +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | + +Default bucket sizes: 1, 5, 20, 60, 180 +Queues: -See [queue adds count](#queue_adds_count) for details. 
+- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_longest_running` A gauge of the number of seconds that this queue's longest running processor has been running for. +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | -See [queue adds count](#queue_adds_count) for details. +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_retries` -A counter of the number of times a message has been retried in the queue +A counter of the number of times a message has been retried in the queue. +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | -See [queue adds count](#queue_adds_count) for details. 
+Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `queue_unfinished_work` A gauge of the number of queue items that have not been processed yet. +| attribute | explanation | +|--------------|-----------------------| +| `queue_name` | The name of the queue | -See [queue adds count](#queue_adds_count) for details. +Queues: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving + +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `total_count` A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace. - -| attribute | explanation | -|-------------|------------------------------------------------| -| `phase` | The phase that the workflow has entered | -| `namespace` | The namespace in which the workflow is running | +| attribute | explanation | +|-------------|-----------------------------------------| +| `phase` | The phase that the Workflow has entered | +| `namespace` | The namespace that the Workflow is in | #### `version` Build metadata for this Controller. 
- -| attribute | explanation | +| attribute | explanation | |------------------|-------------------------------------------------------------------------------------------------------| | `version` | The version of Argo | | `platform` | The [Go platform](https://go.dev/doc/install/source#environment) compiled for. Example: `linux/amd64` | | `go_version` | Version of Go used | -| `build_date` | Build date | -| `compiler` | The compiler used. Example: `gc` | +| `build_date` | Build date | +| `compiler` | The compiler used. Example: `gc` | | `git_commit` | The full Git SHA1 commit | | `git_tree_state` | Whether the Git tree was `dirty` or `clean` when built | | `git_tag` | The Git tag or `untagged` if it was not tagged | #### `workers_busy_count` -A count of queue workers that are busy. +A gauge of queue workers that are busy. +| attribute | explanation | +|---------------|-------------------| +| `worker_type` | The type of queue | + +Worker Types: + +- `cron_wf_queue`: the queue of CronWorkflow updates from the cluster +- `pod_cleanup_queue`: pods which are queued for deletion +- `workflow_queue`: the queue of Workflow updates from the cluster +- `workflow_ttl_queue`: workflows which are queued for deletion due to age +- `workflow_archive_queue`: workflows which are queued for archiving -See [queue adds count](#queue_adds_count) for details. +This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) #### `workflow_condition` A gauge of the number of workflows with different conditions. This will tell you the number of workflows with running pods. 
- -| attribute | explanation | -|-----------|-------------------------------------------------| -| `type` | the type of condition, currently only `Running` | -| `status` | `true` or `false` | +| attribute | explanation | +|-----------|----------------------------------------------------| +| `type` | The type of condition, currently only `PodRunning` | +| `status` | Boolean: `true` or `false` | #### `workflowtemplate_runtime` -A histogram of the duration of workflows using `workflowTemplateRef` only, as they enter each phase. +A histogram of the runtime of workflows using `workflowTemplateRef` only. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. Records time between entering the `Running` phase and completion, so does not include any time in `Pending`. - -| attribute | explanation | -|-----------------|--------------------------------------------------------------| -| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | -| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | -| `namespace` | The namespace from which the WorkflowTemplate is being used | +| attribute | explanation | +|-----------------|-------------------------------------------------------------| +| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | +| `namespace` | The namespace that the WorkflowTemplate is in | +| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | #### `workflowtemplate_triggered_total` A counter of workflows using `workflowTemplateRef` only, as they enter each phase. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. - -| attribute | explanation | -|-----------------|--------------------------------------------------------------| -| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | -| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. 
| -| `namespace` | The namespace from which the WorkflowTemplate is being used | -| `phase` | The phase that the workflow entered | +| attribute | explanation | +|-----------------|-------------------------------------------------------------| +| `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | +| `namespace` | The namespace that the WorkflowTemplate is in | +| `cluster_scope` | A boolean set true if this is a ClusterWorkflowTemplate | + ### Metric types diff --git a/go.mod b/go.mod index bb5df27b3378..12349fcf4837 100644 --- a/go.mod +++ b/go.mod @@ -38,6 +38,7 @@ require ( github.com/jcmturner/gokrb5/v8 v8.4.4 github.com/klauspost/pgzip v1.2.6 github.com/minio/minio-go/v7 v7.0.77 + github.com/nao1215/markdown v0.6.0 github.com/prometheus/client_golang v1.19.1 github.com/prometheus/common v0.55.0 github.com/robfig/cron/v3 v3.0.1 @@ -117,13 +118,17 @@ require ( github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect github.com/jcmturner/goidentity/v6 v6.0.1 // indirect github.com/jcmturner/rpc/v2 v2.0.3 // indirect + github.com/karrick/godirwalk v1.17.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect github.com/ncruces/go-strftime v0.1.9 // indirect + github.com/olekukonko/tablewriter v0.0.5 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/rivo/uniseg v0.4.4 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect github.com/segmentio/fasthash v1.0.3 // indirect diff --git a/go.sum b/go.sum index 60644ad71a5b..30d9f55348a1 100644 --- a/go.sum +++ b/go.sum @@ -552,6 +552,8 @@ github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHm 
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/karrick/godirwalk v1.7.8/go.mod h1:2c9FRhkDxdIbgkOnCEvnSWs71Bhugbl46shStcFDJ34= +github.com/karrick/godirwalk v1.17.0 h1:b4kY7nqDdioR/6qnbHQyDvmA17u5G1cZ6J+CZXwSWoI= +github.com/karrick/godirwalk v1.17.0/go.mod h1:j4mkqPuvaLI8mp1DroR3P6ad7cyYd4c1qeJ3RV7ULlk= github.com/kevinburke/ssh_config v1.2.0 h1:x584FjTGwHzMwvHx18PXxbBVzfnxogHaAReU4gf13a4= github.com/kevinburke/ssh_config v1.2.0/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= @@ -613,6 +615,9 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-runewidth v0.0.3/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= +github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= +github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= @@ -653,6 +658,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f 
h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/nao1215/markdown v0.6.0 h1:kqhrC47K434YA1jMTUwJwSV/hla8ifN3NzehMEffI/E= +github.com/nao1215/markdown v0.6.0/go.mod h1:ObBhnNduWwPN+bu4dtv4JoLRt57ONla7l//03iHIVhY= github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= @@ -660,6 +667,8 @@ github.com/nsf/termbox-go v0.0.0-20190121233118-02980233997d/go.mod h1:IuKpRQcYE github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= +github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= +github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.10.1/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -710,6 +719,9 @@ github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237/go.mod h1:qq github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/rivo/uniseg v0.2.0/go.mod 
h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= +github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= diff --git a/util/telemetry/attributes.go b/util/telemetry/attributes.go index e2ca6700731f..b1e25a9975fc 100644 --- a/util/telemetry/attributes.go +++ b/util/telemetry/attributes.go @@ -1,46 +1,36 @@ +// Code generated by util/telemetry/builder. DO NOT EDIT. package telemetry const ( - AttribBuildVersion string = `version` - AttribBuildPlatform string = `platform` - AttribBuildGoVersion string = `go_version` - AttribBuildDate string = `build_date` AttribBuildCompiler string = `compiler` + AttribBuildDate string = `build_date` AttribBuildGitCommit string = `git_commit` - AttribBuildGitTreeState string = `git_treestate` AttribBuildGitTag string = `git_tag` - - AttribCronWFName string = `name` + AttribBuildGitTreeState string = `git_tree_state` + AttribBuildGoVersion string = `go_version` + AttribBuildPlatform string = `platform` + AttribBuildVersion string = `version` AttribConcurrencyPolicy string = `concurrency_policy` - - AttribDeprecatedFeature string = "feature" - - AttribErrorCause string = "cause" - - AttribLogLevel string = `level` - - AttribNodePhase string = `node_phase` - - AttribPodPhase string = `phase` - AttribPodNamespace string = `namespace` - AttribPodPendingReason string = `reason` - - AttribQueueName string = `queue_name` - - AttribRecentlyStarted string = `recently_started` - - AttribRequestKind = `kind` - AttribRequestVerb = `verb` - AttribRequestCode = `status_code` - + AttribCronWFName string = `name` + AttribCronWFNamespace string = `namespace` + AttribDeprecatedFeature string = 
`feature` + AttribErrorCause string = `cause` + AttribLogLevel string = `level` + AttribNodePhase string = `node_phase` + AttribPodNamespace string = `namespace` + AttribPodPendingReason string = `reason` + AttribPodPhase string = `phase` + AttribQueueName string = `queue_name` + AttribRecentlyStarted string = `recently_started` + AttribRequestCode string = `status_code` + AttribRequestKind string = `kind` + AttribRequestVerb string = `verb` + AttribTemplateCluster string = `cluster_scope` AttribTemplateName string = `name` AttribTemplateNamespace string = `namespace` - AttribTemplateCluster string = `cluster_scope` - - AttribWorkerType string = `worker_type` - + AttribWorkerType string = `worker_type` AttribWorkflowNamespace string = `namespace` AttribWorkflowPhase string = `phase` - AttribWorkflowStatus = `status` - AttribWorkflowType = `type` + AttribWorkflowStatus string = `status` + AttribWorkflowType string = `type` ) diff --git a/util/telemetry/builder/builder.go b/util/telemetry/builder/builder.go new file mode 100644 index 000000000000..936538057448 --- /dev/null +++ b/util/telemetry/builder/builder.go @@ -0,0 +1,192 @@ +package main + +import ( + _ "embed" + "errors" + "flag" + "fmt" + "os" + "regexp" + "slices" + "strings" + "unicode" + + "sigs.k8s.io/yaml" +) + +const generatedBanner string = "// Code generated by util/telemetry/builder. DO NOT EDIT." + +//go:embed values.yaml +var valuesYaml []byte + +type attribute struct { + Name string `json:"name"` + DisplayName string `json:"displayName,omitempty"` + // Description is a markdown explanation for the documentation. One line only. 
+ Description string `json:"description"` +} + +type allowedAttribute struct { + Name string `json:"name"` + Optional bool `json:"optional,omitempty"` +} + +type metric struct { + // Name: Metric name, in CamelCaps + // Will be snake cased for display purposes + Name string `json:"name"` + // Description: short description, emitted on the metrics endpoint and added to the documentation. Do not use markdown here. + Description string `json:"description"` + // ExtendedDescription: Markdown capable further description added to the documentation before attributes + ExtendedDescription string `json:"extendedDescription,omitempty"` + // Notes: Markdown capable further description added to the documentation after attributes + Notes string `json:"notes,omitempty"` + Attributes []allowedAttribute `json:"attributes,omitempty"` + // Unit: OpenTelemetry unit of measurement https://opentelemetry.io/docs/specs/otel/metrics/api/#instrument-unit + Unit string `json:"unit"` + Type string `json:"type"` + DefaultBuckets []float64 `json:"defaultBuckets,omitempty"` +} + +type attributesList []attribute +type metricsList []metric + +type values struct { + Attributes attributesList `json:"attributes"` + Metrics metricsList `json:"metrics"` +} + +func load() values { + var vals values + err := yaml.UnmarshalStrict(valuesYaml, &vals) + if err != nil { + panic(err) + } + return vals +} + +var collectedErrors []error + +func recordErrorString(err string) { + collectedErrors = append(collectedErrors, errors.New(err)) +} +func recordError(err error) { + collectedErrors = append(collectedErrors, err) +} + +func main() { + metricsDocs := flag.String("metricsDocs", "", "Path to metrics.md in the docs") + attributesGo := flag.String("attributesGo", "", "Path to attributes.go in util/telemetry") + metricsListGo := flag.String("metricsListGo", "", "Path to metrics_list.go in util/telemetry") + flag.Parse() + vals := load() + validate(&vals) + if len(collectedErrors) == 0 { + if metricsDocs != nil
&& *metricsDocs != "" { + createMetricsDocs(*metricsDocs, &vals.Metrics, &vals.Attributes) + } + if attributesGo != nil && *attributesGo != "" { + createAttributesGo(*attributesGo, &vals.Attributes) + } + if metricsListGo != nil && *metricsListGo != "" { + createMetricsListGo(*metricsListGo, &vals.Metrics) + } + } + if len(collectedErrors) > 0 { + for _, err := range collectedErrors { + fmt.Println(err) + } + os.Exit(1) + } +} + +func upperToSnake(in string) string { + runes := []rune(in) + in = string(append([]rune{unicode.ToLower(runes[0])}, runes[1:]...)) + re := regexp.MustCompile(`[A-Z]`) + return string(re.ReplaceAllFunc([]byte(in), func(in []byte) []byte { + return []byte(fmt.Sprintf("_%s", strings.ToLower(string(in[0])))) + })) +} + +func (a *attribute) displayName() string { + name := a.Name + if a.DisplayName != "" { + name = a.DisplayName + } + return upperToSnake(name) +} + +func validateMetricsAttributes(metrics *metricsList, attributes *attributesList) { + for _, metric := range *metrics { + for _, attribute := range metric.Attributes { + if getAttribByName(attribute.Name, attributes) == nil { + recordErrorString(fmt.Sprintf("Metric %s: attribute %s not defined", metric.Name, attribute.Name)) + } + } + } +} + +func validateAttributes(attributes *attributesList) { + if !slices.IsSortedFunc(*attributes, func(a, b attribute) int { + return strings.Compare(a.Name, b.Name) + }) { + recordErrorString("Attributes must be alphabetically sorted by Name") + } + for _, attribute := range *attributes { + if strings.Contains(attribute.Description, "\n") { + recordErrorString(fmt.Sprintf("%s: Description must be a single line", attribute.Name)) + } + } +} + +func validateMetrics(metrics *metricsList) { + if !slices.IsSortedFunc(*metrics, func(a, b metric) int { + return strings.Compare(a.Name, b.Name) + }) { + recordErrorString("Metrics must be alphabetically sorted by Name") + } + for _, metric := range *metrics { + // This is easier than enum+custom JSON 
unmarshal as this is not critical code + switch metric.Type { + case "Float64Histogram": + case "Float64ObservableGauge": + case "Int64Counter": + case "Int64UpDownCounter": + case "Int64ObservableGauge": + break + default: + recordErrorString(fmt.Sprintf("%s: Invalid metric type %s", metric.Name, metric.Type)) + } + if strings.Contains(metric.Description, "\n") { + recordErrorString(fmt.Sprintf("%s: Description must be a single line", metric.Name)) + } + if strings.HasSuffix(metric.Description, ".") { + recordErrorString(fmt.Sprintf("%s: Description must not have a trailing period", metric.Name)) + } + } +} + +func validate(vals *values) { + validateAttributes(&vals.Attributes) + validateMetrics(&vals.Metrics) + validateMetricsAttributes(&vals.Metrics, &vals.Attributes) +} + +func (m *metric) instrumentType() string { + return m.Type +} + +func (m *metric) displayName() string { + name := m.Name + return upperToSnake(name) +} + +func getAttribByName(name string, attribs *attributesList) *attribute { + for _, attrib := range *attribs { + if name == attrib.Name { + return &attrib + } + } + return nil +} diff --git a/util/telemetry/builder/docs.go b/util/telemetry/builder/docs.go new file mode 100644 index 000000000000..70e1bb8bd7c8 --- /dev/null +++ b/util/telemetry/builder/docs.go @@ -0,0 +1,110 @@ +package main + +import ( + "bytes" + "fmt" + "io" + "os" + "slices" + "strings" + + md "github.com/nao1215/markdown" +) + +func createMetricsDocs(filename string, metrics *metricsList, attribs *attributesList) { + input, err := os.ReadFile(filename) + if err != nil { + recordError(err) + return + } + + lines := strings.Split(string(input), "\n") + + const begin = "Generated documentation BEGIN" + const end = "Generated documentation END" + type stageType int + const ( + beforeBegin stageType = iota + seekingEnd + finishing + ) + stage := beforeBegin + var cutfrom int + for i, line := range lines { + switch stage { + case beforeBegin: + if strings.Contains(line, begin)
{ + stage = seekingEnd + lines = slices.Insert(lines, i+1, metricsDocsLines(metrics, attribs)) + cutfrom = i + 2 + } + case seekingEnd: + if strings.Contains(line, end) { + stage = finishing + lines = slices.Delete(lines, cutfrom, i+1) + } + case finishing: + // Do nothing + } + } + if stage != finishing { + recordErrorString(fmt.Sprintf("Didn't successfully replace docs in %s", filename)) + return + } + + output := strings.Join(lines, "\n") + err = os.WriteFile(filename, []byte(output), 0444) + if err != nil { + recordError(err) + return + } +} + +func metricsDocsLines(metrics *metricsList, attribs *attributesList) string { + var out bytes.Buffer + outWriter := io.Writer(&out) + markdown := md.NewMarkdown(outWriter) + for _, metric := range *metrics { + markdown.PlainText("") + markdown.H4(md.Code(metric.displayName())) + markdown.PlainText("") + markdown.PlainTextf("%s.", metric.Description) + if metric.ExtendedDescription != "" { + markdown.PlainText(strings.Trim(metric.ExtendedDescription, " \n\t\r")) + } + + if len(metric.Attributes) > 0 { + rows := [][]string{} + for _, metricAttrib := range metric.Attributes { + if attrib := getAttribByName(metricAttrib.Name, attribs); attrib != nil { + // Failure should already be recorded as an error + rows = append(rows, []string{md.Code(attrib.displayName()), attrib.Description}) + } + } + + markdown.CustomTable(md.TableSet{ + Header: []string{"attribute", "explanation"}, + Rows: rows, + }, md.TableOptions{AutoWrapText: false}, + ) + } else { + markdown.PlainText("This metric has no attributes.") + } + if len(metric.DefaultBuckets) > 0 { + buckets := "" + for i, bucket := range metric.DefaultBuckets { + if i != 0 { + buckets = fmt.Sprintf("%s, ", buckets) + } + buckets = fmt.Sprintf("%s%g", buckets, bucket) + } + markdown.PlainTextf("Default bucket sizes: %s", buckets) + } + + if metric.Notes != "" { + markdown.PlainText(strings.Trim(metric.Notes, " \n\t\r")) + } + } + markdown.Build() + return 
strings.TrimSuffix(out.String(), "\n") +} diff --git a/util/telemetry/builder/go.go b/util/telemetry/builder/go.go new file mode 100644 index 000000000000..b1e1f415fa81 --- /dev/null +++ b/util/telemetry/builder/go.go @@ -0,0 +1,81 @@ +package main + +import ( + "bytes" + "fmt" + "os" + "os/exec" +) + +func createMetricsListGo(filename string, metrics *metricsList) { + writeMetricsListGo(filename, metrics) + goFmtFile(filename) +} + +func writeMetricsListGo(filename string, metrics *metricsList) { + f, err := os.Create(filename) + if err != nil { + recordError(err) + return + } + defer f.Close() + fmt.Fprintf(f, "%s\n", generatedBanner) + fmt.Fprintf(f, "package telemetry\n\n") + for _, metric := range *metrics { + fmt.Fprintf(f, "var Instrument%s = BuiltinInstrument{\n", metric.Name) + fmt.Fprintf(f, "\tname: \"%s\",\n", metric.displayName()) + fmt.Fprintf(f, "\tdescription: \"%s\",\n", metric.Description) + fmt.Fprintf(f, "\tunit: \"%s\",\n", metric.Unit) + fmt.Fprintf(f, "\tinstType: %s,\n", metric.instrumentType()) + if len(metric.Attributes) > 0 { + fmt.Fprintf(f, "\tattributes: []BuiltinAttribute{\n") + for _, attrib := range metric.Attributes { + fmt.Fprintf(f, "\t\t{\n\t\t\tname: Attrib%s,\n", attrib.Name) + if attrib.Optional { + fmt.Fprintf(f, "\t\t\toptional: true,\n") + } + fmt.Fprintf(f, "\t\t},\n") + } + fmt.Fprintf(f, "\t},\n") + } + if len(metric.DefaultBuckets) > 0 { + fmt.Fprintf(f, "\tdefaultBuckets: []float64{\n") + for _, bucket := range metric.DefaultBuckets { + fmt.Fprintf(f, "\t\t%f,\n", bucket) + } + fmt.Fprintf(f, "\t},\n") + } + + fmt.Fprintf(f, "}\n\n") + } +} + +func createAttributesGo(filename string, attributes *attributesList) { + writeAttributesGo(filename, attributes) + goFmtFile(filename) +} + +func writeAttributesGo(filename string, attributes *attributesList) { + f, err := os.Create(filename) + if err != nil { + recordError(err) + return + } + defer f.Close() + fmt.Fprintf(f, "%s\n", generatedBanner) + fmt.Fprintf(f, "package 
telemetry\n\nconst (\n") + for _, attrib := range *attributes { + fmt.Fprintf(f, "\tAttrib%s string = `%s`\n", attrib.Name, attrib.displayName()) + } + fmt.Fprintf(f, ")\n") +} + +func goFmtFile(filename string) { + cmd := exec.Command("go", "fmt", filename) + var stderr bytes.Buffer + cmd.Stderr = &stderr + _, err := cmd.Output() + if err != nil { + recordErrorString(fmt.Sprintf("%s: %s", "go fmt failed", stderr.String())) + } +} diff --git a/util/telemetry/builder/values.yaml b/util/telemetry/builder/values.yaml new file mode 100644 index 000000000000..383fbbde5d8b --- /dev/null +++ b/util/telemetry/builder/values.yaml @@ -0,0 +1,406 @@ +attributes: + - name: BuildCompiler + displayName: compiler + description: "The compiler used. Example: `gc`" + - name: BuildDate + description: Build date + - name: BuildGitCommit + displayName: git_commit + description: The full Git SHA1 commit + - name: BuildGitTag + displayName: git_tag + description: "The Git tag or `untagged` if it was not tagged" + - name: BuildGitTreeState + displayName: git_tree_state + description: "Whether the Git tree was `dirty` or `clean` when built" + - name: BuildGoVersion + displayName: go_version + description: Version of Go used + - name: BuildPlatform + displayName: platform + description: "The [Go platform](https://go.dev/doc/install/source#environment) compiled for. 
Example: `linux/amd64`" + - name: BuildVersion + displayName: version + description: The version of Argo + - name: ConcurrencyPolicy + description: "The concurrency policy which was triggered, will be either `Forbid` or `Replace`" + - name: CronWFName + displayName: name + description: "⚠️ The name of the CronWorkflow" + - name: CronWFNamespace + displayName: namespace + description: The namespace that the CronWorkflow is in + - name: DeprecatedFeature + displayName: feature + description: The name of the feature used + - name: ErrorCause + displayName: cause + description: The cause of the error + - name: LogLevel + displayName: level + description: The log level of the message + - name: NodePhase + description: "The phase that the pod's node was in" + - name: PodNamespace + displayName: namespace + description: The namespace that the pod is in + - name: PodPendingReason + displayName: reason + description: Summary of the kubernetes Reason for pending + - name: PodPhase + displayName: phase + description: The phase that the pod is in + - name: QueueName + description: The name of the queue + - name: RecentlyStarted + description: "Boolean: was this pod started recently" + - name: RequestCode + displayName: status_code + description: The HTTP status code of the response + - name: RequestKind + displayName: kind + description: "The kubernetes `kind` involved in the request such as `configmaps`" + - name: RequestVerb + displayName: verb + description: "The verb of the request, such as `Get` or `List`" + - name: TemplateCluster + displayName: cluster_scope + description: A boolean set true if this is a ClusterWorkflowTemplate + - name: TemplateName + displayName: name + description: "⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate." 
+ - name: TemplateNamespace + displayName: namespace + description: The namespace that the WorkflowTemplate is in + - name: WorkerType + description: The type of queue + - name: WorkflowNamespace + displayName: namespace + description: The namespace that the Workflow is in + - name: WorkflowPhase + displayName: phase + description: The phase that the Workflow has entered + - name: WorkflowStatus + displayName: status + description: "Boolean: `true` or `false`" + - name: WorkflowType + displayName: type + description: "The type of condition, currently only `PodRunning`" + +metrics: + - name: CronworkflowsConcurrencypolicyTriggered + description: A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running + attributes: + - name: CronWFName + - name: CronWFNamespace + - name: ConcurrencyPolicy + unit: "{cronworkflow}" + type: Int64Counter + - name: CronworkflowsTriggeredTotal + description: A counter of the total number of times a CronWorkflow has been triggered + extendedDescription: "Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted." + attributes: + - name: CronWFName + - name: CronWFNamespace + unit: "{cronworkflow}" + type: Int64Counter + - name: DeprecatedFeature + description: "Incidents of deprecated feature being used" + extendedDescription: | + Deprecated features are [explained here](deprecations.md). + 🚨 This counter may go up much more than once for a single use of the feature. 
+ notes: | + `feature` will be one of: + + - [`cronworkflow schedule`](deprecations.md#cronworkflow_schedule) + - [`synchronization mutex`](deprecations.md#synchronization_mutex) + - [`synchronization semaphore`](deprecations.md#synchronization_semaphore) + - [`workflow podpriority`](deprecations.md#workflow_podpriority) + attributes: + - name: DeprecatedFeature + - name: WorkflowNamespace + optional: true + unit: "{feature}" + type: Int64Counter + - name: ErrorCount + description: A counter of certain errors incurred by the controller by cause + notes: | + The currently tracked specific errors are + + - `OperationPanic` - the controller called `panic()` on encountering a programming bug + - `CronWorkflowSubmissionError` - A CronWorkflow failed submission + - `CronWorkflowSpecError` - A CronWorkflow has an invalid specification + attributes: + - name: ErrorCause + unit: "{error}" + type: Int64Counter + - name: Gauge + description: A gauge of the number of workflows currently in the cluster in each phase + extendedDescription: | + The `Running` count does not mean that a workflows pods are running, just that the controller has scheduled them. + A workflow can be stuck in `Running` with pending pods for a long time. + attributes: + - name: WorkflowStatus + unit: "{workflow}" + type: Int64ObservableGauge + - name: IsLeader + description: Emits 1 if leader, 0 otherwise. Always 1 if leader election is disabled + extendedDescription: | + A gauge indicating if this Controller is the [leader](high-availability.md#workflow-controller). + + - `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). + - `0` otherwise, indicating that this controller is a standby that is not currently running workflows. 
+ unit: "{leader}" + type: Int64ObservableGauge + - name: K8sRequestDuration + description: A histogram recording the API requests sent to the Kubernetes API + notes: This contains all the information contained in `k8s_request_total` along with timings. + attributes: + - name: RequestKind + - name: RequestVerb + - name: RequestCode + unit: "s" + type: Float64Histogram + defaultBuckets: [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0] + - name: K8sRequestTotal + description: A counter of the number of API requests sent to the Kubernetes API + notes: This metric is calculable from `k8s_request_duration`, and it is suggested you just collect that metric instead. + attributes: + - name: RequestKind + - name: RequestVerb + - name: RequestCode + unit: "{request}" + type: Int64Counter + - name: LogMessages + description: "A count of log messages emitted by the controller by log level: `error`, `warn` and `info`" + attributes: + - name: LogLevel + unit: "{message}" + type: Int64Counter + - name: OperationDurationSeconds + description: A histogram of durations of operations + extendedDescription: | + An operation is a single workflow reconciliation loop within the workflow-controller. + It's the time for the controller to process a single workflow after it has been read from the cluster and is a measure of the performance of the controller affected by the complexity of the workflow. + notes: The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using an `histogramBuckets` modifier in the `metricsConfig` block. + unit: "s" + type: Float64Histogram + - name: PodMissing + description: "Incidents of pod missing" + extendedDescription: | + A counter of pods that were not seen - for example they are being deleted by Kubernetes. + You should only see this under high load.
+ attributes: + - name: NodePhase + - name: RecentlyStarted + notes: "`recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds." + unit: "{pod}" + type: Int64Counter + - name: PodPendingCount + description: "Total number of pods that started pending by reason" + attributes: + - name: PodPendingReason + - name: PodNamespace + unit: "{pod}" + type: Int64Counter + - name: PodsGauge + description: A gauge of the number of workflow created pods currently in the cluster in each phase + extendedDescription: | + It is possible for a workflow to start, but no pods be running (for example cluster is too busy to run them). + This metric sheds light on actual work being done. + attributes: + - name: PodPhase + unit: "{pod}" + type: Int64ObservableGauge + - name: PodsTotalCount + description: "Total number of pods that have entered each phase" + attributes: + - name: PodPhase + - name: PodNamespace + notes: | + This metric ignores the `PodInitializing` reason and does not count it. + The `reason` attribute is the value from the Reason message before the `:` in the message. + This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed. 
+ unit: "{pod}" + type: Int64ObservableGauge + - name: QueueAddsCount + description: A counter of additions to the work queues inside the controller + extendedDescription: The rate of this shows how busy that area of the controller is + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Int64Counter + - name: QueueDepthGauge + description: A gauge of the current depth of the queues + extendedDescription: If these get large then the workflow controller is not keeping up with the cluster. 
+ notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Int64UpDownCounter + - name: QueueDuration + description: A histogram of the time events in the queues are taking to be processed + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: s + type: Float64Histogram + defaultBuckets: [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0] + - name: QueueLatency + description: A histogram of the time events in the queues are taking before they are processed + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue 
metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: s + type: Float64Histogram + defaultBuckets: [1.0, 5.0, 20.0, 60.0, 180.0] + - name: QueueLongestRunning + description: A gauge of the number of seconds that this queue's longest running processor has been running for + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: s + type: Float64ObservableGauge + - name: QueueRetries + description: A counter of the number of times a message has been retried in the queue + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Int64Counter + - name: QueueUnfinishedWork + description: A gauge of the number of queue items that have not been processed yet + notes: | + Queues: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which 
are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: QueueName + unit: "{item}" + type: Float64ObservableGauge + - name: TotalCount + description: A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace + attributes: + - name: WorkflowPhase + - name: WorkflowNamespace + unit: "{workflow}" + type: Int64Counter + - name: Version + description: "Build metadata for this Controller" + attributes: + - name: BuildVersion + - name: BuildPlatform + - name: BuildGoVersion + - name: BuildDate + - name: BuildCompiler + - name: BuildGitCommit + - name: BuildGitTreeState + - name: BuildGitTag + unit: "{unused}" + type: Int64Counter + - name: WorkersBusyCount + description: A gauge of queue workers that are busy + notes: | + Worker Types: + + - `cron_wf_queue`: the queue of CronWorkflow updates from the cluster + - `pod_cleanup_queue`: pods which are queued for deletion + - `workflow_queue`: the queue of Workflow updates from the cluster + - `workflow_ttl_queue`: workflows which are queued for deletion due to age + - `workflow_archive_queue`: workflows which are queued for archiving + + This and associated metrics are all directly sourced from the [client-go workqueue metrics](https://godocs.io/k8s.io/client-go/util/workqueue) + attributes: + - name: WorkerType + unit: "{worker}" + type: Int64UpDownCounter + - name: WorkflowCondition + description: A gauge of the number of workflows with different conditions + extendedDescription: This will tell you the number of workflows with running pods. 
+ attributes: + - name: WorkflowType + - name: WorkflowStatus + unit: "{workflow}" + type: Int64ObservableGauge + - name: WorkflowtemplateRuntime + description: A histogram of the runtime of workflows using `workflowTemplateRef` only + extendedDescription: | + Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. + Records time between entering the `Running` phase and completion, so does not include any time in `Pending`. + attributes: + - name: TemplateName + - name: TemplateNamespace + - name: TemplateCluster + unit: s + type: Float64Histogram + - name: WorkflowtemplateTriggeredTotal + description: A counter of workflows using `workflowTemplateRef` only, as they enter each phase + extendedDescription: | + Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. + attributes: + - name: TemplateName + - name: TemplateNamespace + - name: TemplateCluster + unit: "{workflow_template}" + type: Int64Counter diff --git a/util/telemetry/builtinInstrument.go b/util/telemetry/builtinInstrument.go new file mode 100644 index 000000000000..e8b242ae15fe --- /dev/null +++ b/util/telemetry/builtinInstrument.go @@ -0,0 +1,37 @@ +package telemetry + +//import () + +type BuiltinAttribute struct { + name string + optional bool +} + +type BuiltinInstrument struct { + name string + description string + unit string + instType instrumentType + attributes []BuiltinAttribute + defaultBuckets []float64 +} + +func (bi *BuiltinInstrument) Name() string { + return bi.name +} + +// CreateBuiltinInstrument adds a yaml defined builtin instrument +// opts parameter is for legacy metrics, do not use for new metrics +func (m *Metrics) CreateBuiltinInstrument(instrument BuiltinInstrument, opts ...instrumentOption) error { + opts = append(opts, WithAsBuiltIn()) + if len(instrument.defaultBuckets) > 0 { + opts = append(opts, + WithDefaultBuckets(instrument.defaultBuckets)) + } + return m.CreateInstrument(instrument.instType, + instrument.name, + instrument.description, + 
instrument.unit, + opts..., + ) +} diff --git a/util/telemetry/counter_deprecations.go b/util/telemetry/counter_deprecations.go index 7f4bbec320f2..3fd8c90461a2 100644 --- a/util/telemetry/counter_deprecations.go +++ b/util/telemetry/counter_deprecations.go @@ -4,17 +4,8 @@ import ( "context" ) -const ( - nameDeprecated = `deprecated_feature` -) - func AddDeprecationCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(Int64Counter, - nameDeprecated, - "Incidents of deprecated feature being used.", - "{feature}", - WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(InstrumentDeprecatedFeature) } func (m *Metrics) DeprecatedFeature(ctx context.Context, deprecation string, namespace string) { @@ -24,5 +15,5 @@ func (m *Metrics) DeprecatedFeature(ctx context.Context, deprecation string, nam if namespace != "" { attribs = append(attribs, InstAttrib{Name: AttribWorkflowNamespace, Value: namespace}) } - m.AddInt(ctx, nameDeprecated, 1, attribs) + m.AddInt(ctx, InstrumentDeprecatedFeature.Name(), 1, attribs) } diff --git a/util/telemetry/metrics_list.go b/util/telemetry/metrics_list.go new file mode 100644 index 000000000000..0217b45f0d20 --- /dev/null +++ b/util/telemetry/metrics_list.go @@ -0,0 +1,420 @@ +// Code generated by util/telemetry/builder. DO NOT EDIT. 
+package telemetry + +var InstrumentCronworkflowsConcurrencypolicyTriggered = BuiltinInstrument{ + name: "cronworkflows_concurrencypolicy_triggered", + description: "A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running", + unit: "{cronworkflow}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribCronWFName, + }, + { + name: AttribCronWFNamespace, + }, + { + name: AttribConcurrencyPolicy, + }, + }, +} + +var InstrumentCronworkflowsTriggeredTotal = BuiltinInstrument{ + name: "cronworkflows_triggered_total", + description: "A counter of the total number of times a CronWorkflow has been triggered", + unit: "{cronworkflow}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribCronWFName, + }, + { + name: AttribCronWFNamespace, + }, + }, +} + +var InstrumentDeprecatedFeature = BuiltinInstrument{ + name: "deprecated_feature", + description: "Incidents of deprecated feature being used", + unit: "{feature}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribDeprecatedFeature, + }, + { + name: AttribWorkflowNamespace, + optional: true, + }, + }, +} + +var InstrumentErrorCount = BuiltinInstrument{ + name: "error_count", + description: "A counter of certain errors incurred by the controller by cause", + unit: "{error}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribErrorCause, + }, + }, +} + +var InstrumentGauge = BuiltinInstrument{ + name: "gauge", + description: "A gauge of the number of workflows currently in the cluster in each phase", + unit: "{workflow}", + instType: Int64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribWorkflowStatus, + }, + }, +} + +var InstrumentIsLeader = BuiltinInstrument{ + name: "is_leader", + description: "Emits 1 if leader, 0 otherwise. 
Always 1 if leader election is disabled", + unit: "{leader}", + instType: Int64ObservableGauge, +} + +var InstrumentK8sRequestDuration = BuiltinInstrument{ + name: "k8s_request_duration", + description: "A histogram recording the API requests sent to the Kubernetes API", + unit: "s", + instType: Float64Histogram, + attributes: []BuiltinAttribute{ + { + name: AttribRequestKind, + }, + { + name: AttribRequestVerb, + }, + { + name: AttribRequestCode, + }, + }, + defaultBuckets: []float64{ + 0.100000, + 0.200000, + 0.500000, + 1.000000, + 2.000000, + 5.000000, + 10.000000, + 20.000000, + 60.000000, + 180.000000, + }, +} + +var InstrumentK8sRequestTotal = BuiltinInstrument{ + name: "k8s_request_total", + description: "A counter of the number of API requests sent to the Kubernetes API", + unit: "{request}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribRequestKind, + }, + { + name: AttribRequestVerb, + }, + { + name: AttribRequestCode, + }, + }, +} + +var InstrumentLogMessages = BuiltinInstrument{ + name: "log_messages", + description: "A count of log messages emitted by the controller by log level: `error`, `warn` and `info`", + unit: "{message}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribLogLevel, + }, + }, +} + +var InstrumentOperationDurationSeconds = BuiltinInstrument{ + name: "operation_duration_seconds", + description: "A histogram of durations of operations", + unit: "s", + instType: Float64Histogram, +} + +var InstrumentPodMissing = BuiltinInstrument{ + name: "pod_missing", + description: "Incidents of pod missing", + unit: "{pod}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribNodePhase, + }, + { + name: AttribRecentlyStarted, + }, + }, +} + +var InstrumentPodPendingCount = BuiltinInstrument{ + name: "pod_pending_count", + description: "Total number of pods that started pending by reason", + unit: "{pod}", + instType: Int64Counter, + attributes: 
[]BuiltinAttribute{ + { + name: AttribPodPendingReason, + }, + { + name: AttribPodNamespace, + }, + }, +} + +var InstrumentPodsGauge = BuiltinInstrument{ + name: "pods_gauge", + description: "A gauge of the number of workflow created pods currently in the cluster in each phase", + unit: "{pod}", + instType: Int64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribPodPhase, + }, + }, +} + +var InstrumentPodsTotalCount = BuiltinInstrument{ + name: "pods_total_count", + description: "Total number of pods that have entered each phase", + unit: "{pod}", + instType: Int64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribPodPhase, + }, + { + name: AttribPodNamespace, + }, + }, +} + +var InstrumentQueueAddsCount = BuiltinInstrument{ + name: "queue_adds_count", + description: "A counter of additions to the work queues inside the controller", + unit: "{item}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, + }, + }, +} + +var InstrumentQueueDepthGauge = BuiltinInstrument{ + name: "queue_depth_gauge", + description: "A gauge of the current depth of the queues", + unit: "{item}", + instType: Int64UpDownCounter, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, + }, + }, +} + +var InstrumentQueueDuration = BuiltinInstrument{ + name: "queue_duration", + description: "A histogram of the time events in the queues are taking to be processed", + unit: "s", + instType: Float64Histogram, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, + }, + }, + defaultBuckets: []float64{ + 0.100000, + 0.200000, + 0.500000, + 1.000000, + 2.000000, + 5.000000, + 10.000000, + 20.000000, + 60.000000, + 180.000000, + }, +} + +var InstrumentQueueLatency = BuiltinInstrument{ + name: "queue_latency", + description: "A histogram of the time events in the queues are taking before they are processed", + unit: "s", + instType: Float64Histogram, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, 
+ }, + }, + defaultBuckets: []float64{ + 1.000000, + 5.000000, + 20.000000, + 60.000000, + 180.000000, + }, +} + +var InstrumentQueueLongestRunning = BuiltinInstrument{ + name: "queue_longest_running", + description: "A gauge of the number of seconds that this queue's longest running processor has been running for", + unit: "s", + instType: Float64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, + }, + }, +} + +var InstrumentQueueRetries = BuiltinInstrument{ + name: "queue_retries", + description: "A counter of the number of times a message has been retried in the queue", + unit: "{item}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, + }, + }, +} + +var InstrumentQueueUnfinishedWork = BuiltinInstrument{ + name: "queue_unfinished_work", + description: "A gauge of the number of queue items that have not been processed yet", + unit: "{item}", + instType: Float64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribQueueName, + }, + }, +} + +var InstrumentTotalCount = BuiltinInstrument{ + name: "total_count", + description: "A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace", + unit: "{workflow}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribWorkflowPhase, + }, + { + name: AttribWorkflowNamespace, + }, + }, +} + +var InstrumentVersion = BuiltinInstrument{ + name: "version", + description: "Build metadata for this Controller", + unit: "{unused}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribBuildVersion, + }, + { + name: AttribBuildPlatform, + }, + { + name: AttribBuildGoVersion, + }, + { + name: AttribBuildDate, + }, + { + name: AttribBuildCompiler, + }, + { + name: AttribBuildGitCommit, + }, + { + name: AttribBuildGitTreeState, + }, + { + name: AttribBuildGitTag, + }, + }, +} + +var InstrumentWorkersBusyCount = BuiltinInstrument{ + name: 
"workers_busy_count", + description: "A gauge of queue workers that are busy", + unit: "{worker}", + instType: Int64UpDownCounter, + attributes: []BuiltinAttribute{ + { + name: AttribWorkerType, + }, + }, +} + +var InstrumentWorkflowCondition = BuiltinInstrument{ + name: "workflow_condition", + description: "A gauge of the number of workflows with different conditions", + unit: "{workflow}", + instType: Int64ObservableGauge, + attributes: []BuiltinAttribute{ + { + name: AttribWorkflowType, + }, + { + name: AttribWorkflowStatus, + }, + }, +} + +var InstrumentWorkflowtemplateRuntime = BuiltinInstrument{ + name: "workflowtemplate_runtime", + description: "A histogram of the runtime of workflows using `workflowTemplateRef` only", + unit: "s", + instType: Float64Histogram, + attributes: []BuiltinAttribute{ + { + name: AttribTemplateName, + }, + { + name: AttribTemplateNamespace, + }, + { + name: AttribTemplateCluster, + }, + }, +} + +var InstrumentWorkflowtemplateTriggeredTotal = BuiltinInstrument{ + name: "workflowtemplate_triggered_total", + description: "A counter of workflows using `workflowTemplateRef` only, as they enter each phase", + unit: "{workflow_template}", + instType: Int64Counter, + attributes: []BuiltinAttribute{ + { + name: AttribTemplateName, + }, + { + name: AttribTemplateNamespace, + }, + { + name: AttribTemplateCluster, + }, + }, +} diff --git a/util/telemetry/version.go b/util/telemetry/version.go index 055aa038bf74..fb951c729dce 100644 --- a/util/telemetry/version.go +++ b/util/telemetry/version.go @@ -7,19 +7,13 @@ import ( ) func AddVersion(ctx context.Context, m *Metrics) error { - const nameVersion = `version` - err := m.CreateInstrument(Int64Counter, - nameVersion, - "Build metadata for this Controller", - "{unused}", - WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(InstrumentVersion) if err != nil { return err } version := argo.GetVersion() - m.AddInt(ctx, nameVersion, 1, InstAttribs{ + m.AddInt(ctx, InstrumentVersion.Name(), 1, 
InstAttribs{ {Name: AttribBuildVersion, Value: version.Version}, {Name: AttribBuildPlatform, Value: version.Platform}, {Name: AttribBuildGoVersion, Value: version.GoVersion}, diff --git a/workflow/metrics/counter_cronworkflow_policy.go b/workflow/metrics/counter_cronworkflow_policy.go index d77e5172dfeb..bf2aad2f857f 100644 --- a/workflow/metrics/counter_cronworkflow_policy.go +++ b/workflow/metrics/counter_cronworkflow_policy.go @@ -7,23 +7,14 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameCronPolicy = `cronworkflows_concurrencypolicy_triggered` -) - func addCronWfPolicyCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameCronPolicy, - "Total number of times CronWorkflow concurrencyPolicy has triggered", - "{cronworkflow}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentCronworkflowsConcurrencypolicyTriggered) } func (m *Metrics) CronWfPolicy(ctx context.Context, name, namespace string, policy wfv1.ConcurrencyPolicy) { - m.AddInt(ctx, nameCronPolicy, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentCronworkflowsConcurrencypolicyTriggered.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribCronWFName, Value: name}, - {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, + {Name: telemetry.AttribCronWFNamespace, Value: namespace}, {Name: telemetry.AttribConcurrencyPolicy, Value: string(policy)}, }) } diff --git a/workflow/metrics/counter_cronworkflow_trigger.go b/workflow/metrics/counter_cronworkflow_trigger.go index f77c488b6b36..a92333d57aeb 100644 --- a/workflow/metrics/counter_cronworkflow_trigger.go +++ b/workflow/metrics/counter_cronworkflow_trigger.go @@ -6,22 +6,13 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameCronTriggered = `cronworkflows_triggered_total` -) - func addCronWfTriggerCounter(_ context.Context, m *Metrics) error { - return 
m.CreateInstrument(telemetry.Int64Counter, - nameCronTriggered, - "Total number of cron workflows triggered", - "{cronworkflow}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentCronworkflowsTriggeredTotal) } func (m *Metrics) CronWfTrigger(ctx context.Context, name, namespace string) { - m.AddInt(ctx, nameCronTriggered, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentCronworkflowsTriggeredTotal.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribCronWFName, Value: name}, - {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, + {Name: telemetry.AttribCronWFNamespace, Value: namespace}, }) } diff --git a/workflow/metrics/counter_error.go b/workflow/metrics/counter_error.go index f71f7e6d8315..39a6f1160df5 100644 --- a/workflow/metrics/counter_error.go +++ b/workflow/metrics/counter_error.go @@ -9,37 +9,31 @@ import ( type ErrorCause string const ( - nameErrorCount = `error_count` ErrorCauseOperationPanic ErrorCause = "OperationPanic" ErrorCauseCronWorkflowSubmissionError ErrorCause = "CronWorkflowSubmissionError" ErrorCauseCronWorkflowSpecError ErrorCause = "CronWorkflowSpecError" ) func addErrorCounter(ctx context.Context, m *Metrics) error { - err := m.CreateInstrument(telemetry.Int64Counter, - nameErrorCount, - "Number of errors encountered by the controller by cause", - "{error}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentErrorCount) if err != nil { return err } // Initialise all values to zero for _, cause := range []ErrorCause{ErrorCauseOperationPanic, ErrorCauseCronWorkflowSubmissionError, ErrorCauseCronWorkflowSpecError} { - m.AddInt(ctx, nameErrorCount, 0, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(cause)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 0, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(cause)}}) } return nil } func (m *Metrics) OperationPanic(ctx context.Context) { - 
m.AddInt(ctx, nameErrorCount, 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseOperationPanic)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseOperationPanic)}}) } func (m *Metrics) CronWorkflowSubmissionError(ctx context.Context) { - m.AddInt(ctx, nameErrorCount, 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSubmissionError)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSubmissionError)}}) } func (m *Metrics) CronWorkflowSpecError(ctx context.Context) { - m.AddInt(ctx, nameErrorCount, 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSpecError)}}) + m.AddInt(ctx, telemetry.InstrumentErrorCount.Name(), 1, telemetry.InstAttribs{{Name: telemetry.AttribErrorCause, Value: string(ErrorCauseCronWorkflowSpecError)}}) } diff --git a/workflow/metrics/counter_log.go b/workflow/metrics/counter_log.go index b9cea55952ab..dd3a37061dba 100644 --- a/workflow/metrics/counter_log.go +++ b/workflow/metrics/counter_log.go @@ -13,19 +13,14 @@ type logMetric struct { } func addLogCounter(ctx context.Context, m *Metrics) error { - const nameLogMessages = `log_messages` - err := m.CreateInstrument(telemetry.Int64Counter, - nameLogMessages, - "Total number of log messages.", - "{message}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentLogMessages) + name := telemetry.InstrumentLogMessages.Name() lm := logMetric{ - counter: m.AllInstruments[nameLogMessages], + counter: m.AllInstruments[name], } log.AddHook(lm) for _, level := range lm.Levels() { - m.AddInt(ctx, nameLogMessages, 0, telemetry.InstAttribs{ + m.AddInt(ctx, name, 0, telemetry.InstAttribs{ {Name: telemetry.AttribLogLevel, Value: level.String()}, }) } diff --git 
a/workflow/metrics/counter_pod_missing.go b/workflow/metrics/counter_pod_missing.go index 8f3b227d8ef0..6b51cd2ef98d 100644 --- a/workflow/metrics/counter_pod_missing.go +++ b/workflow/metrics/counter_pod_missing.go @@ -6,21 +6,12 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - namePodMissing = `pod_missing` -) - func addPodMissingCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - namePodMissing, - "Incidents of pod missing.", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentPodMissing) } func (m *Metrics) incPodMissing(ctx context.Context, val int64, recentlyStarted bool, phase string) { - m.AddInt(ctx, namePodMissing, val, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentPodMissing.Name(), val, telemetry.InstAttribs{ {Name: telemetry.AttribRecentlyStarted, Value: recentlyStarted}, {Name: telemetry.AttribNodePhase, Value: phase}, }) diff --git a/workflow/metrics/counter_pod_pending.go b/workflow/metrics/counter_pod_pending.go index 5138df86abea..d9f485b28e32 100644 --- a/workflow/metrics/counter_pod_pending.go +++ b/workflow/metrics/counter_pod_pending.go @@ -7,17 +7,8 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - namePodPending = `pod_pending_count` -) - func addPodPendingCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - namePodPending, - "Total number of pods that started pending by reason", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentPodPendingCount) } func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string) { @@ -30,7 +21,7 @@ func (m *Metrics) ChangePodPending(ctx context.Context, reason, namespace string // the pod_phase metric can cope with this being visible return default: - m.AddInt(ctx, namePodPending, 1, telemetry.InstAttribs{ + m.AddInt(ctx, 
telemetry.InstrumentPodPendingCount.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribPodPendingReason, Value: splitReason[0]}, {Name: telemetry.AttribPodNamespace, Value: namespace}, }) diff --git a/workflow/metrics/counter_pod_phase.go b/workflow/metrics/counter_pod_phase.go index 37686c5fc4c3..1b644d389ae2 100644 --- a/workflow/metrics/counter_pod_phase.go +++ b/workflow/metrics/counter_pod_phase.go @@ -6,21 +6,12 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - namePodPhase = `pods_total_count` -) - func addPodPhaseCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - namePodPhase, - "Total number of Pods that have entered each phase", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentPodsTotalCount) } func (m *Metrics) ChangePodPhase(ctx context.Context, phase, namespace string) { - m.AddInt(ctx, namePodPhase, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentPodsTotalCount.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribPodPhase, Value: phase}, {Name: telemetry.AttribPodNamespace, Value: namespace}, }) diff --git a/workflow/metrics/counter_template.go b/workflow/metrics/counter_template.go index f1cb3500cab3..f7f8a6a8b37e 100644 --- a/workflow/metrics/counter_template.go +++ b/workflow/metrics/counter_template.go @@ -6,17 +6,8 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameWFTemplateTriggered = `workflowtemplate_triggered_total` -) - func addWorkflowTemplateCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameWFTemplateTriggered, - "Total number of workflow templates triggered by workflowTemplateRef", - "{workflow_template}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentWorkflowtemplateTriggeredTotal) } func templateAttribs(name, namespace string, cluster bool) 
telemetry.InstAttribs { @@ -31,5 +22,5 @@ func (m *Metrics) CountWorkflowTemplate(ctx context.Context, phase MetricWorkflo attribs := templateAttribs(name, namespace, cluster) attribs = append(attribs, telemetry.InstAttrib{Name: telemetry.AttribWorkflowPhase, Value: string(phase)}) - m.AddInt(ctx, nameWFTemplateTriggered, 1, attribs) + m.AddInt(ctx, telemetry.InstrumentWorkflowtemplateTriggeredTotal.Name(), 1, attribs) } diff --git a/workflow/metrics/counter_workflow_phase.go b/workflow/metrics/counter_workflow_phase.go index 91c56eef4853..b95439aaa0c3 100644 --- a/workflow/metrics/counter_workflow_phase.go +++ b/workflow/metrics/counter_workflow_phase.go @@ -7,10 +7,6 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameWorkflowPhaseCounter = `total_count` -) - type MetricWorkflowPhase string const ( @@ -41,16 +37,11 @@ func ConvertWorkflowPhase(inPhase wfv1.WorkflowPhase) MetricWorkflowPhase { } func addWorkflowPhaseCounter(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Int64Counter, - nameWorkflowPhaseCounter, - "Total number of workflows that have entered each phase", - "{workflow}", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentTotalCount) } func (m *Metrics) ChangeWorkflowPhase(ctx context.Context, phase MetricWorkflowPhase, namespace string) { - m.AddInt(ctx, nameWorkflowPhaseCounter, 1, telemetry.InstAttribs{ + m.AddInt(ctx, telemetry.InstrumentTotalCount.Name(), 1, telemetry.InstAttribs{ {Name: telemetry.AttribWorkflowPhase, Value: string(phase)}, {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, }) diff --git a/workflow/metrics/gauge_pod_phase.go b/workflow/metrics/gauge_pod_phase.go index b93e9c33729b..ac401ca20ccf 100644 --- a/workflow/metrics/gauge_pod_phase.go +++ b/workflow/metrics/gauge_pod_phase.go @@ -17,23 +17,18 @@ type podPhaseGauge struct { } func addPodPhaseGauge(ctx context.Context, m *Metrics) error { - const namePodsPhase = 
`pods_gauge` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - namePodsPhase, - "Number of Pods from Workflows currently accessible by the controller by status.", - "{pod}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentPodsGauge) if err != nil { return err } + name := telemetry.InstrumentPodsGauge.Name() if m.callbacks.PodPhase != nil { ppGauge := podPhaseGauge{ callback: m.callbacks.PodPhase, - gauge: m.AllInstruments[namePodsPhase], + gauge: m.AllInstruments[name], } - return m.AllInstruments[namePodsPhase].RegisterCallback(m.Metrics, ppGauge.update) + return m.AllInstruments[name].RegisterCallback(m.Metrics, ppGauge.update) } return nil } diff --git a/workflow/metrics/gauge_workflow_condition.go b/workflow/metrics/gauge_workflow_condition.go index 3709cd07d9d1..0c174f6456a0 100644 --- a/workflow/metrics/gauge_workflow_condition.go +++ b/workflow/metrics/gauge_workflow_condition.go @@ -18,13 +18,7 @@ type workflowConditionGauge struct { } func addWorkflowConditionGauge(_ context.Context, m *Metrics) error { - const nameWorkflowCondition = `workflow_condition` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - nameWorkflowCondition, - "Workflow condition.", - "{unit}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentWorkflowCondition) if err != nil { return err } @@ -32,9 +26,9 @@ func addWorkflowConditionGauge(_ context.Context, m *Metrics) error { if m.callbacks.WorkflowCondition != nil { wfcGauge := workflowConditionGauge{ callback: m.callbacks.WorkflowCondition, - gauge: m.AllInstruments[nameWorkflowCondition], + gauge: m.AllInstruments[telemetry.InstrumentWorkflowCondition.Name()], } - return m.AllInstruments[nameWorkflowCondition].RegisterCallback(m.Metrics, wfcGauge.update) + return m.AllInstruments[telemetry.InstrumentWorkflowCondition.Name()].RegisterCallback(m.Metrics, wfcGauge.update) } return nil // TODO init all phases? 
diff --git a/workflow/metrics/gauge_workflow_phase.go b/workflow/metrics/gauge_workflow_phase.go index 59a6e670e413..997d7088f084 100644 --- a/workflow/metrics/gauge_workflow_phase.go +++ b/workflow/metrics/gauge_workflow_phase.go @@ -17,23 +17,18 @@ type workflowPhaseGauge struct { } func addWorkflowPhaseGauge(_ context.Context, m *Metrics) error { - const nameWorkflowPhaseGauge = `gauge` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - nameWorkflowPhaseGauge, - "number of Workflows currently accessible by the controller by status", - "{workflow}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentGauge) if err != nil { return err } + name := telemetry.InstrumentGauge.Name() if m.callbacks.WorkflowPhase != nil { wfpGauge := workflowPhaseGauge{ callback: m.callbacks.WorkflowPhase, - gauge: m.AllInstruments[nameWorkflowPhaseGauge], + gauge: m.AllInstruments[name], } - return m.AllInstruments[nameWorkflowPhaseGauge].RegisterCallback(m.Metrics, wfpGauge.update) + return m.AllInstruments[name].RegisterCallback(m.Metrics, wfpGauge.update) } return nil // TODO init all phases? 
diff --git a/workflow/metrics/histogram_durations.go b/workflow/metrics/histogram_durations.go index 6b6038e31ff2..43e8979d4761 100644 --- a/workflow/metrics/histogram_durations.go +++ b/workflow/metrics/histogram_durations.go @@ -13,7 +13,6 @@ import ( ) const ( - nameOperationDuration = `operation_duration_seconds` operationDurationDefaultBucketCount = 6 ) @@ -26,15 +25,10 @@ func addOperationDurationHistogram(_ context.Context, m *Metrics) error { } bucketWidth := maxOperationTimeSeconds / float64(operationDurationMetricBucketCount) // The buckets here are only the 'defaults' and can be overridden with configmap defaults - return m.CreateInstrument(telemetry.Float64Histogram, - nameOperationDuration, - "Histogram of durations of operations", - "s", - telemetry.WithDefaultBuckets(prometheus.LinearBuckets(bucketWidth, bucketWidth, operationDurationMetricBucketCount)), - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentOperationDurationSeconds, + telemetry.WithDefaultBuckets(prometheus.LinearBuckets(bucketWidth, bucketWidth, operationDurationMetricBucketCount))) } func (m *Metrics) OperationCompleted(ctx context.Context, durationSeconds float64) { - m.Record(ctx, nameOperationDuration, durationSeconds, telemetry.InstAttribs{}) + m.Record(ctx, telemetry.InstrumentOperationDurationSeconds.Name(), durationSeconds, telemetry.InstAttribs{}) } diff --git a/workflow/metrics/histogram_template.go b/workflow/metrics/histogram_template.go index a466b9112fa3..bb87145a71cf 100644 --- a/workflow/metrics/histogram_template.go +++ b/workflow/metrics/histogram_template.go @@ -7,19 +7,10 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameWorkflowTemplateRuntime = `workflowtemplate_runtime` -) - func addWorkflowTemplateHistogram(_ context.Context, m *Metrics) error { - return m.CreateInstrument(telemetry.Float64Histogram, - nameWorkflowTemplateRuntime, - "Duration of workflow template runs run through 
workflowTemplateRefs", - "s", - telemetry.WithAsBuiltIn(), - ) + return m.CreateBuiltinInstrument(telemetry.InstrumentWorkflowtemplateRuntime) } func (m *Metrics) RecordWorkflowTemplateTime(ctx context.Context, duration time.Duration, name, namespace string, cluster bool) { - m.Record(ctx, nameWorkflowTemplateRuntime, duration.Seconds(), templateAttribs(name, namespace, cluster)) + m.Record(ctx, telemetry.InstrumentWorkflowtemplateRuntime.Name(), duration.Seconds(), templateAttribs(name, namespace, cluster)) } diff --git a/workflow/metrics/leader.go b/workflow/metrics/leader.go index ff3562b66b52..e91dc6c9c8a9 100644 --- a/workflow/metrics/leader.go +++ b/workflow/metrics/leader.go @@ -16,24 +16,19 @@ type leaderGauge struct { } func addIsLeader(ctx context.Context, m *Metrics) error { - const nameLeader = `is_leader` - err := m.CreateInstrument(telemetry.Int64ObservableGauge, - nameLeader, - "Emits 1 if leader, 0 otherwise. Always 1 if leader election is disabled.", - "{leader}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentIsLeader) if err != nil { return err } if m.callbacks.IsLeader == nil { return nil } + name := telemetry.InstrumentIsLeader.Name() lGauge := leaderGauge{ callback: m.callbacks.IsLeader, - gauge: m.AllInstruments[nameLeader], + gauge: m.AllInstruments[name], } - return m.AllInstruments[nameLeader].RegisterCallback(m.Metrics, lGauge.update) + return m.AllInstruments[name].RegisterCallback(m.Metrics, lGauge.update) } func (l *leaderGauge) update(_ context.Context, o metric.Observer) error { diff --git a/workflow/metrics/leader_test.go b/workflow/metrics/leader_test.go index 76514ab4fa51..0d7c59c7c32e 100644 --- a/workflow/metrics/leader_test.go +++ b/workflow/metrics/leader_test.go @@ -22,7 +22,7 @@ func TestIsLeader(t *testing.T) { require.NoError(t, err) assert.NotNil(t, te) attribs := attribute.NewSet() - val, err := te.GetInt64GaugeValue(`is_leader`, &attribs) + val, err := 
te.GetInt64GaugeValue(telemetry.InstrumentIsLeader.Name(), &attribs) require.NoError(t, err) assert.Equal(t, int64(1), val) } @@ -38,7 +38,7 @@ func TestNotLeader(t *testing.T) { require.NoError(t, err) assert.NotNil(t, te) attribs := attribute.NewSet() - val, err := te.GetInt64GaugeValue(`is_leader`, &attribs) + val, err := te.GetInt64GaugeValue(telemetry.InstrumentIsLeader.Name(), &attribs) require.NoError(t, err) assert.Equal(t, int64(0), val) } diff --git a/workflow/metrics/metrics_k8s_request.go b/workflow/metrics/metrics_k8s_request.go index 3294260823aa..29bdf769793f 100644 --- a/workflow/metrics/metrics_k8s_request.go +++ b/workflow/metrics/metrics_k8s_request.go @@ -11,28 +11,12 @@ import ( "github.com/argoproj/argo-workflows/v3/util/telemetry" ) -const ( - nameK8sRequestTotal = `k8s_request_total` - nameK8sRequestDuration = `k8s_request_duration` -) - func addK8sRequests(_ context.Context, m *Metrics) error { - err := m.CreateInstrument(telemetry.Int64Counter, - nameK8sRequestTotal, - "Number of kubernetes requests executed.", - "{request}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentK8sRequestTotal) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64Histogram, - nameK8sRequestDuration, - "Duration of kubernetes requests executed.", - "s", - telemetry.WithDefaultBuckets([]float64{0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0}), - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentK8sRequestDuration) // Register this metrics with the global k8sMetrics.metrics = m return err @@ -63,8 +47,8 @@ func (m metricsRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) {Name: telemetry.AttribRequestVerb, Value: verb}, {Name: telemetry.AttribRequestCode, Value: x.StatusCode}, } - (*m.metrics).AddInt(m.ctx, nameK8sRequestTotal, 1, attribs) - (*m.metrics).Record(m.ctx, nameK8sRequestDuration, duration.Seconds(), attribs) + (*m.metrics).AddInt(m.ctx, 
telemetry.InstrumentK8sRequestTotal.Name(), 1, attribs) + (*m.metrics).Record(m.ctx, telemetry.InstrumentK8sRequestDuration.Name(), duration.Seconds(), attribs) } return x, err } diff --git a/workflow/metrics/metrics_test.go b/workflow/metrics/metrics_test.go index b4933e844737..31bbeb0ad99a 100644 --- a/workflow/metrics/metrics_test.go +++ b/workflow/metrics/metrics_test.go @@ -22,7 +22,7 @@ func TestMetrics(t *testing.T) { m.OperationCompleted(m.Ctx, 5) assert.NotNil(t, te) attribs := attribute.NewSet() - val, err := te.GetFloat64HistogramData(nameOperationDuration, &attribs) + val, err := te.GetFloat64HistogramData(telemetry.InstrumentOperationDurationSeconds.Name(), &attribs) require.NoError(t, err) assert.Equal(t, []float64{5, 10, 15, 20, 25, 30}, val.Bounds) assert.Equal(t, []uint64{1, 0, 0, 0, 0, 0, 0}, val.BucketCounts) @@ -151,13 +151,13 @@ func TestWorkflowQueueMetrics(t *testing.T) { wfQueue := m.RateLimiterWithBusyWorkers(m.Ctx, workqueue.DefaultTypedControllerRateLimiter[string](), "workflow_queue") defer wfQueue.ShutDown() - assert.NotNil(t, m.AllInstruments[nameWorkersQueueDepth]) - assert.NotNil(t, m.AllInstruments[nameWorkersQueueLatency]) + assert.NotNil(t, m.AllInstruments[telemetry.InstrumentQueueDepthGauge.Name()]) + assert.NotNil(t, m.AllInstruments[telemetry.InstrumentQueueLatency.Name()]) wfQueue.Add("hello") - require.NotNil(t, m.AllInstruments[nameWorkersQueueAdds]) - val, err := te.GetInt64CounterValue(nameWorkersQueueAdds, &attribs) + require.NotNil(t, m.AllInstruments[telemetry.InstrumentQueueAddsCount.Name()]) + val, err := te.GetInt64CounterValue(telemetry.InstrumentQueueAddsCount.Name(), &attribs) require.NoError(t, err) assert.Equal(t, int64(1), val) } diff --git a/workflow/metrics/work_queue.go b/workflow/metrics/work_queue.go index 41e1ee6f85c5..312a858961f3 100644 --- a/workflow/metrics/work_queue.go +++ b/workflow/metrics/work_queue.go @@ -11,17 +11,6 @@ import ( "k8s.io/utils/ptr" ) -const ( - nameWorkersBusy = 
`workers_busy_count` - nameWorkersQueueDepth = `queue_depth_gauge` - nameWorkersQueueAdds = `queue_adds_count` - nameWorkersQueueLatency = `queue_latency` - nameWorkersQueueDuration = `queue_duration` - nameWorkersRetries = `queue_retries` - nameWorkersUnfinishedWork = `queue_unfinished_work` - nameWorkersLongestRunning = `queue_longest_running` -) - // Act as a metrics provider for a workqueues var _ workqueue.MetricsProvider = &Metrics{} @@ -34,94 +23,51 @@ type workersBusyRateLimiterWorkQueue struct { } func addWorkQueueMetrics(_ context.Context, m *Metrics) error { - err := m.CreateInstrument(telemetry.Int64UpDownCounter, - nameWorkersBusy, - "Number of workers currently busy", - "{worker}", - telemetry.WithAsBuiltIn(), - ) + err := m.CreateBuiltinInstrument(telemetry.InstrumentWorkersBusyCount) if err != nil { return err } - err = m.CreateInstrument(telemetry.Int64UpDownCounter, - nameWorkersQueueDepth, - "Depth of the queue", - "{item}", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueDepthGauge) if err != nil { return err } - err = m.CreateInstrument(telemetry.Int64Counter, - nameWorkersQueueAdds, - "Adds to the queue", - "{item}", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueAddsCount) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64Histogram, - nameWorkersQueueLatency, - "Time objects spend waiting in the queue", - "s", - telemetry.WithDefaultBuckets([]float64{1.0, 5.0, 20.0, 60.0, 180.0}), - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueLatency) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64Histogram, - nameWorkersQueueDuration, - "Time objects spend being processed from the queue", - "s", - telemetry.WithDefaultBuckets([]float64{0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 60.0, 180.0}), - telemetry.WithAsBuiltIn(), - ) + err = 
m.CreateBuiltinInstrument(telemetry.InstrumentQueueDuration) if err != nil { return err } - err = m.CreateInstrument(telemetry.Int64Counter, - nameWorkersRetries, - "Retries in the queues", - "{item}", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueRetries) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64ObservableGauge, - nameWorkersUnfinishedWork, - "Unfinished work time", - "s", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueUnfinishedWork) if err != nil { return err } unfinishedCallback := queueUserdata{ - gauge: m.AllInstruments[nameWorkersUnfinishedWork], - } - m.AllInstruments[nameWorkersUnfinishedWork].SetUserdata(&unfinishedCallback) - err = m.AllInstruments[nameWorkersUnfinishedWork].RegisterCallback(m.Metrics, unfinishedCallback.update) + gauge: m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()]} + m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()].SetUserdata(&unfinishedCallback) + err = m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()].RegisterCallback(m.Metrics, unfinishedCallback.update) if err != nil { return err } - err = m.CreateInstrument(telemetry.Float64ObservableGauge, - nameWorkersLongestRunning, - "Longest running worker", - "s", - telemetry.WithAsBuiltIn(), - ) + err = m.CreateBuiltinInstrument(telemetry.InstrumentQueueLongestRunning) if err != nil { return err } longestRunningCallback := queueUserdata{ - gauge: m.AllInstruments[nameWorkersLongestRunning], + gauge: m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()], } - m.AllInstruments[nameWorkersLongestRunning].SetUserdata(&longestRunningCallback) - err = m.AllInstruments[nameWorkersLongestRunning].RegisterCallback(m.Metrics, longestRunningCallback.update) + m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()].SetUserdata(&longestRunningCallback) + err = 
m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()].RegisterCallback(m.Metrics, longestRunningCallback.update) if err != nil { return err } @@ -132,7 +78,7 @@ func (m *Metrics) RateLimiterWithBusyWorkers(ctx context.Context, workQueue work queue := workersBusyRateLimiterWorkQueue{ TypedRateLimitingInterface: workqueue.NewTypedRateLimitingQueueWithConfig(workQueue, workqueue.TypedRateLimitingQueueConfig[string]{Name: queueName}), workerType: queueName, - busyGauge: m.AllInstruments[nameWorkersBusy], + busyGauge: m.AllInstruments[telemetry.InstrumentWorkersBusyCount.Name()], ctx: ctx, } queue.newWorker(ctx) @@ -221,7 +167,7 @@ func (m *Metrics) NewDepthMetric(name string) workqueue.GaugeMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueDepth], + inst: m.AllInstruments[telemetry.InstrumentQueueDepthGauge.Name()], } } @@ -229,7 +175,7 @@ func (m *Metrics) NewAddsMetric(name string) workqueue.CounterMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueAdds], + inst: m.AllInstruments[telemetry.InstrumentQueueAddsCount.Name()], } } @@ -237,7 +183,7 @@ func (m *Metrics) NewLatencyMetric(name string) workqueue.HistogramMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueLatency], + inst: m.AllInstruments[telemetry.InstrumentQueueLatency.Name()], } } @@ -245,7 +191,7 @@ func (m *Metrics) NewWorkDurationMetric(name string) workqueue.HistogramMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersQueueDuration], + inst: m.AllInstruments[telemetry.InstrumentQueueDuration.Name()], } } @@ -253,7 +199,7 @@ func (m *Metrics) NewRetriesMetric(name string) workqueue.CounterMetric { return queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersRetries], + inst: m.AllInstruments[telemetry.InstrumentQueueRetries.Name()], } } @@ -261,7 +207,7 @@ func (m *Metrics) NewUnfinishedWorkSecondsMetric(name 
string) workqueue.Settable metric := queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersUnfinishedWork], + inst: m.AllInstruments[telemetry.InstrumentQueueUnfinishedWork.Name()], value: ptr.To(float64(0.0)), } ud := getQueueUserdata(metric.inst) @@ -273,7 +219,7 @@ func (m *Metrics) NewLongestRunningProcessorSecondsMetric(name string) workqueue metric := queueMetric{ ctx: m.Ctx, name: name, - inst: m.AllInstruments[nameWorkersLongestRunning], + inst: m.AllInstruments[telemetry.InstrumentQueueLongestRunning.Name()], value: ptr.To(float64(0.0)), } ud := getQueueUserdata(metric.inst) diff --git a/workflow/metrics/work_queue_test.go b/workflow/metrics/work_queue_test.go index 0222a292e23e..15e9f851ea4c 100644 --- a/workflow/metrics/work_queue_test.go +++ b/workflow/metrics/work_queue_test.go @@ -19,30 +19,30 @@ func TestMetricsWorkQueue(t *testing.T) { queue := m.RateLimiterWithBusyWorkers(m.Ctx, workqueue.DefaultTypedControllerRateLimiter[string](), "test") defer queue.ShutDown() - val, err := te.GetInt64CounterValue(nameWorkersBusy, &attribsWT) + val, err := te.GetInt64CounterValue(telemetry.InstrumentWorkersBusyCount.Name(), &attribsWT) require.NoError(t, err) assert.Equal(t, int64(0), val) attribsQN := attribute.NewSet(attribute.String(telemetry.AttribQueueName, "test")) queue.Add("A") - val, err = te.GetInt64CounterValue(nameWorkersBusy, &attribsWT) + val, err = te.GetInt64CounterValue(telemetry.InstrumentWorkersBusyCount.Name(), &attribsWT) require.NoError(t, err) assert.Equal(t, int64(0), val) - val, err = te.GetInt64CounterValue(nameWorkersQueueDepth, &attribsQN) + val, err = te.GetInt64CounterValue(telemetry.InstrumentQueueDepthGauge.Name(), &attribsQN) require.NoError(t, err) assert.Equal(t, int64(1), val) queue.Get() - val, err = te.GetInt64CounterValue(nameWorkersBusy, &attribsWT) + val, err = te.GetInt64CounterValue(telemetry.InstrumentWorkersBusyCount.Name(), &attribsWT) require.NoError(t, err) assert.Equal(t, int64(1), val) - val, 
err = te.GetInt64CounterValue(nameWorkersQueueDepth, &attribsQN) + val, err = te.GetInt64CounterValue(telemetry.InstrumentQueueDepthGauge.Name(), &attribsQN) require.NoError(t, err) assert.Equal(t, int64(0), val) queue.Done("A") - val, err = te.GetInt64CounterValue(nameWorkersBusy, &attribsWT) + val, err = te.GetInt64CounterValue(telemetry.InstrumentWorkersBusyCount.Name(), &attribsWT) require.NoError(t, err) assert.Equal(t, int64(0), val) }