From a08cc7c376d3a09982c12ae6db0b83d96c106a07 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Thu, 9 Nov 2023 17:14:38 +0100 Subject: [PATCH 01/16] Reworks the prometheus metrics to adhere to best practices Signed-off-by: Bernard Grymonpon Signed-off-by: Bernard Grymonpon --- CHANGELOG.md | 2 + pkg/metricscollector/metricscollectors.go | 10 +- pkg/metricscollector/opentelemetry.go | 9 +- pkg/metricscollector/prommetrics.go | 142 +++++++++++++++++----- pkg/scaling/cache/scalers_cache.go | 6 +- pkg/scaling/scale_handler.go | 10 +- 6 files changed, 130 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d230956e6a4..31c898fe2d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,7 @@ Here is an overview of all new **experimental** features: - **Kafka Scaler**: Add support for Kerberos authentication (SASL / GSSAPI) ([#4836](https://github.com/kedacore/keda/issues/4836)) - **Prometheus Metrics**: Introduce paused ScaledObjects in Prometheus metrics ([#4430](https://github.com/kedacore/keda/issues/4430)) - **Pulsar Scaler**: support endpointParams in pulsar oauth ([#5069](https://github.com/kedacore/keda/issues/5069)) +- **General**: Renamed Prometheus metrics to include units and `total` where approriate ([#4854](https://github.com/kedacore/keda/issues/4854)) ### Fixes @@ -82,6 +83,7 @@ You can find all deprecations in [this overview](https://github.com/kedacore/ked New deprecation(s): - Remove support for Azure AD Pod Identity-based authentication ([#5035](https://github.com/kedacore/keda/issues/5035)) +- Various Prometheus metrics have been renamed to follow the preferred naming conventions. The old ones are still available, but will be removed in the future ([#4854](https://github.com/kedacore/keda/issues/4854)). 
### Breaking Changes diff --git a/pkg/metricscollector/metricscollectors.go b/pkg/metricscollector/metricscollectors.go index d367028e3b3..45d458ebe75 100644 --- a/pkg/metricscollector/metricscollectors.go +++ b/pkg/metricscollector/metricscollectors.go @@ -16,6 +16,8 @@ limitations under the License. package metricscollector +import "time" + const ( ClusterTriggerAuthenticationResource = "cluster_trigger_authentication" TriggerAuthenticationResource = "trigger_authentication" @@ -33,10 +35,10 @@ type MetricsCollector interface { RecordScalerMetric(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) // RecordScalerLatency create a measurement of the latency to external metric - RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) + RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration) // RecordScalableObjectLatency create a measurement of the latency executing scalable object loop - RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) + RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) // RecordScalerActive create a measurement of the activity of the scaler RecordScalerActive(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, active bool) @@ -79,14 +81,14 @@ func RecordScalerMetric(namespace string, scaledObject string, scaler string, sc } // RecordScalerLatency create a measurement of the latency to external metric -func RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) { +func RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration) { for _, element := range collectors { element.RecordScalerLatency(namespace, 
scaledObject, scaler, scalerIndex, metric, value) } } // RecordScalableObjectLatency create a measurement of the latency executing scalable object loop -func RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) { +func RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) { for _, element := range collectors { element.RecordScalableObjectLatency(namespace, name, isScaledObject, value) } diff --git a/pkg/metricscollector/opentelemetry.go b/pkg/metricscollector/opentelemetry.go index 64211e3f3e4..2e5ac4a9071 100644 --- a/pkg/metricscollector/opentelemetry.go +++ b/pkg/metricscollector/opentelemetry.go @@ -5,6 +5,7 @@ import ( "fmt" "runtime" "strconv" + "time" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -185,8 +186,8 @@ func ScalerMetricsLatencyCallback(_ context.Context, obsrv api.Float64Observer) } // RecordScalerLatency create a measurement of the latency to external metric -func (o *OtelMetrics) RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) { - otelScalerMetricsLatencyVal.val = value +func (o *OtelMetrics) RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration) { + otelScalerMetricsLatencyVal.val = value.Seconds() otelScalerMetricsLatencyVal.measurementOption = getScalerMeasurementOption(namespace, scaledObject, scaler, scalerIndex, metric) } @@ -199,7 +200,7 @@ func ScalableObjectLatencyCallback(_ context.Context, obsrv api.Float64Observer) } // RecordScalableObjectLatency create a measurement of the latency executing scalable object loop -func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) { +func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) { resourceType := "scaledjob" 
if isScaledObject { resourceType = "scaledobject" @@ -210,7 +211,7 @@ func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, attribute.Key("type").String(resourceType), attribute.Key("name").String(name)) - otelInternalLoopLatencyVal.val = value + otelInternalLoopLatencyVal.val = value.Seconds() otelInternalLoopLatencyVal.measurementOption = opt } diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 5d318d378ae..6c7f953a244 100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -19,6 +19,7 @@ package metricscollector import ( "runtime" "strconv" + "time" "github.com/prometheus/client_golang/prometheus" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -39,12 +40,12 @@ var ( }, []string{"version", "git_commit", "goversion", "goos", "goarch"}, ) - scalerErrorsTotal = prometheus.NewCounterVec( + scalerErrorsTotalDeprecated = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors_total", - Help: "Total number of errors for all scalers", + Help: "DEPRECATED - use a `sum(scaler_errors_total{scaler!=\"\"})` over all scalers", }, []string{}, ) @@ -53,16 +54,25 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "metrics_value", - Help: "Metric Value used for HPA", + Help: "Current value of the metric obtained from the scaler that the Horizontal Pod Autoscaler (HPA) uses to make scaling decisions.", }, metricLabels, ) - scalerMetricsLatency = prometheus.NewGaugeVec( + scalerMetricsLatencyDeprecated = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "metrics_latency", - Help: "Scaler Metrics Latency", + Help: "DEPRECATED - use 'scaler_metrics_latency_seconds' instead.", + }, + metricLabels, + ) + scalerMetricsLatency = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: DefaultPromMetricsNamespace, + 
Subsystem: "scaler", + Name: "metrics_latency_seconds", + Help: "Latency observed by a scaler in getting the metric from the source, in seconds.", }, metricLabels, ) @@ -71,7 +81,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "active", - Help: "Activity of a Scaler Metric", + Help: "Indicates whether a scaler is active (1), or not (0).", }, metricLabels, ) @@ -80,53 +90,97 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaled_object", Name: "paused", - Help: "Indicates whether a ScaledObject is paused", + Help: "Indicates whether a ScaledObject is paused (1), or not (0).", }, []string{"namespace", "scaledObject"}, ) - scalerErrors = prometheus.NewCounterVec( + scalerErrorsDeprecated = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors", - Help: "Number of scaler errors", + Help: "DEPRECATED - use 'scaler_errors_total' instead.", }, metricLabels, ) - scaledObjectErrors = prometheus.NewCounterVec( + scalerErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "scaler", + Name: "errors_total", + Help: "Total number of errors observed by a scaler.", + }, + metricLabels, + ) + scaledObjectErrorsDeprecated = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "scaled_object", Name: "errors", - Help: "Number of scaled object errors", + Help: "DEPRECATED - use 'scaled_object_errors_total' instead.", }, []string{"namespace", "scaledObject"}, ) - - triggerTotalsGaugeVec = prometheus.NewGaugeVec( + scaledObjectErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "scaled_object", + Name: "errors_total", + Help: "Total number of errors observed by a scaled object.", + }, + []string{"namespace", "scaledObject"}, + ) + triggerTotalsGaugeVecDeprecated = prometheus.NewGaugeVec( 
prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "trigger", Name: "totals", + Help: "DEPRECATED - use 'trigger_handled_total' instead.", }, []string{"type"}, ) - - crdTotalsGaugeVec = prometheus.NewGaugeVec( + triggerHandled = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "trigger", + Name: "handled_total", + Help: "Total number of triggers currently handled.", + }, + []string{"type"}, + ) + crdTotalsGaugeVecDeprecated = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "resource", Name: "totals", + Help: "DEPRECATED - use 'resource_handled_total' instead.", }, []string{"type", "namespace"}, ) - - internalLoopLatency = prometheus.NewGaugeVec( + crdHandled = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "resource", + Name: "handled_total", + Help: "Total number of ScaledObjects/ScaledJobs currently handled.", + }, + []string{"type", "namespace"}, + ) + internalLoopLatencyDeprecated = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "internal_scale_loop", Name: "latency", - Help: "Internal latency of ScaledObject/ScaledJob loop execution", + Help: "DEPRECATED - use 'internal_scale_loop_latency_seconds' instead.", + }, + []string{"namespace", "type", "resource"}, + ) + internalLoopLatency = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "internal_scale_loop", + Name: "latency_seconds", + Help: "Internal latency of ScaledObject/ScaledJob loop execution in seconds.", }, []string{"namespace", "type", "resource"}, ) @@ -136,17 +190,22 @@ type PromMetrics struct { } func NewPromMetrics() *PromMetrics { - metrics.Registry.MustRegister(scalerErrorsTotal) + metrics.Registry.MustRegister(scalerErrorsTotalDeprecated) metrics.Registry.MustRegister(scalerMetricsValue) + 
metrics.Registry.MustRegister(scalerMetricsLatencyDeprecated) metrics.Registry.MustRegister(scalerMetricsLatency) + metrics.Registry.MustRegister(internalLoopLatencyDeprecated) metrics.Registry.MustRegister(internalLoopLatency) metrics.Registry.MustRegister(scalerActive) + metrics.Registry.MustRegister(scalerErrorsDeprecated) metrics.Registry.MustRegister(scalerErrors) + metrics.Registry.MustRegister(scaledObjectErrorsDeprecated) metrics.Registry.MustRegister(scaledObjectErrors) metrics.Registry.MustRegister(scaledObjectPaused) - - metrics.Registry.MustRegister(triggerTotalsGaugeVec) - metrics.Registry.MustRegister(crdTotalsGaugeVec) + metrics.Registry.MustRegister(triggerTotalsGaugeVecDeprecated) + metrics.Registry.MustRegister(triggerHandled) + metrics.Registry.MustRegister(crdTotalsGaugeVecDeprecated) + metrics.Registry.MustRegister(crdHandled) metrics.Registry.MustRegister(buildInfo) RecordBuildInfo() @@ -164,17 +223,19 @@ func (p *PromMetrics) RecordScalerMetric(namespace string, scaledObject string, } // RecordScalerLatency create a measurement of the latency to external metric -func (p *PromMetrics) RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value float64) { - scalerMetricsLatency.With(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)).Set(value) +func (p *PromMetrics) RecordScalerLatency(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, value time.Duration) { + scalerMetricsLatency.With(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)).Set(value.Seconds()) + scalerMetricsLatencyDeprecated.With(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)).Set(float64(value.Milliseconds())) } // RecordScalableObjectLatency create a measurement of the latency executing scalable object loop -func (p *PromMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) { +func (p *PromMetrics) 
RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) { resourceType := "scaledjob" if isScaledObject { resourceType = "scaledobject" } - internalLoopLatency.WithLabelValues(namespace, resourceType, name).Set(value) + internalLoopLatency.WithLabelValues(namespace, resourceType, name).Set(value.Seconds()) + internalLoopLatencyDeprecated.WithLabelValues(namespace, resourceType, name).Set(float64(value.Milliseconds())) } // RecordScalerActive create a measurement of the activity of the scaler @@ -203,14 +264,19 @@ func (p *PromMetrics) RecordScaledObjectPaused(namespace string, scaledObject st func (p *PromMetrics) RecordScalerError(namespace string, scaledObject string, scaler string, scalerIndex int, metric string, err error) { if err != nil { scalerErrors.With(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)).Inc() + scalerErrorsDeprecated.With(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)).Inc() p.RecordScaledObjectError(namespace, scaledObject, err) - scalerErrorsTotal.With(prometheus.Labels{}).Inc() + scalerErrorsTotalDeprecated.With(prometheus.Labels{}).Inc() return } // initialize metric with 0 if not already set _, errscaler := scalerErrors.GetMetricWith(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)) if errscaler != nil { - log.Error(errscaler, "Unable to write to metrics to Prometheus Server: %v") + log.Error(errscaler, "Unable to record metrics: %v") + } + _, errscalerdep := scalerErrorsDeprecated.GetMetricWith(getLabels(namespace, scaledObject, scaler, scalerIndex, metric)) + if errscalerdep != nil { + log.Error(errscaler, "Unable to record (deprecated) metrics: %v") } } @@ -219,12 +285,18 @@ func (p *PromMetrics) RecordScaledObjectError(namespace string, scaledObject str labels := prometheus.Labels{"namespace": namespace, "scaledObject": scaledObject} if err != nil { scaledObjectErrors.With(labels).Inc() + scaledObjectErrorsDeprecated.With(labels).Inc() 
return } // initialize metric with 0 if not already set _, errscaledobject := scaledObjectErrors.GetMetricWith(labels) if errscaledobject != nil { - log.Error(errscaledobject, "Unable to write to metrics to Prometheus Server: %v") + log.Error(errscaledobject, "Unable to record metrics: %v") + return + } + _, errscaledobjectdep := scaledObjectErrorsDeprecated.GetMetricWith(labels) + if errscaledobjectdep != nil { + log.Error(errscaledobject, "Unable to record metrics: %v") return } } @@ -235,13 +307,15 @@ func getLabels(namespace string, scaledObject string, scaler string, scalerIndex func (p *PromMetrics) IncrementTriggerTotal(triggerType string) { if triggerType != "" { - triggerTotalsGaugeVec.WithLabelValues(triggerType).Inc() + triggerHandled.WithLabelValues(triggerType).Inc() + triggerTotalsGaugeVecDeprecated.WithLabelValues(triggerType).Inc() } } func (p *PromMetrics) DecrementTriggerTotal(triggerType string) { if triggerType != "" { - triggerTotalsGaugeVec.WithLabelValues(triggerType).Dec() + triggerHandled.WithLabelValues(triggerType).Dec() + triggerTotalsGaugeVecDeprecated.WithLabelValues(triggerType).Dec() } } @@ -250,7 +324,8 @@ func (p *PromMetrics) IncrementCRDTotal(crdType, namespace string) { namespace = defaultNamespace } - crdTotalsGaugeVec.WithLabelValues(crdType, namespace).Inc() + crdHandled.WithLabelValues(crdType, namespace).Inc() + crdTotalsGaugeVecDeprecated.WithLabelValues(crdType, namespace).Inc() } func (p *PromMetrics) DecrementCRDTotal(crdType, namespace string) { @@ -258,5 +333,6 @@ func (p *PromMetrics) DecrementCRDTotal(crdType, namespace string) { namespace = defaultNamespace } - crdTotalsGaugeVec.WithLabelValues(crdType, namespace).Dec() + crdHandled.WithLabelValues(crdType, namespace).Dec() + crdTotalsGaugeVecDeprecated.WithLabelValues(crdType, namespace).Dec() } diff --git a/pkg/scaling/cache/scalers_cache.go b/pkg/scaling/cache/scalers_cache.go index 4cb16eebeb0..ee1288ab7d0 100644 --- a/pkg/scaling/cache/scalers_cache.go +++ 
b/pkg/scaling/cache/scalers_cache.go @@ -121,14 +121,14 @@ func (c *ScalersCache) GetMetricSpecForScalingForScaler(ctx context.Context, ind // GetMetricsAndActivityForScaler returns metric value, activity and latency for a scaler identified by the metric name // and by the input index (from the list of scalers in this ScaledObject) -func (c *ScalersCache) GetMetricsAndActivityForScaler(ctx context.Context, index int, metricName string) ([]external_metrics.ExternalMetricValue, bool, int64, error) { +func (c *ScalersCache) GetMetricsAndActivityForScaler(ctx context.Context, index int, metricName string) ([]external_metrics.ExternalMetricValue, bool, time.Duration, error) { if index < 0 || index >= len(c.Scalers) { return nil, false, -1, fmt.Errorf("scaler with id %d not found. Len = %d", index, len(c.Scalers)) } startTime := time.Now() metric, activity, err := c.Scalers[index].Scaler.GetMetricsAndActivity(ctx, metricName) if err == nil { - return metric, activity, time.Since(startTime).Milliseconds(), nil + return metric, activity, time.Since(startTime), nil } ns, err := c.refreshScaler(ctx, index) @@ -137,7 +137,7 @@ func (c *ScalersCache) GetMetricsAndActivityForScaler(ctx context.Context, index } startTime = time.Now() metric, activity, err = ns.GetMetricsAndActivity(ctx, metricName) - return metric, activity, time.Since(startTime).Milliseconds(), err + return metric, activity, time.Since(startTime), err } func (c *ScalersCache) refreshScaler(ctx context.Context, id int) (scalers.Scaler, error) { diff --git a/pkg/scaling/scale_handler.go b/pkg/scaling/scale_handler.go index 5e7dc3ddc7f..8f7a4b6bd82 100644 --- a/pkg/scaling/scale_handler.go +++ b/pkg/scaling/scale_handler.go @@ -172,7 +172,7 @@ func (h *scaleHandler) startScaleLoop(ctx context.Context, withTriggers *kedav1a // we calculate the next execution time based on the pollingInterval and record the difference // between the expected execution time and the real execution time delay := time.Since(next) - 
metricscollector.RecordScalableObjectLatency(withTriggers.Namespace, withTriggers.Name, isScaledObject, float64(delay.Milliseconds())) + metricscollector.RecordScalableObjectLatency(withTriggers.Namespace, withTriggers.Name, isScaledObject, delay) tmr := time.NewTimer(pollingInterval) next = time.Now().Add(pollingInterval) @@ -504,10 +504,10 @@ func (h *scaleHandler) GetScaledObjectMetrics(ctx context.Context, scaledObjectN } if !metricsFoundInCache { - var latency int64 + var latency time.Duration metrics, _, latency, err = cache.GetMetricsAndActivityForScaler(ctx, scalerIndex, metricName) if latency != -1 { - metricscollector.RecordScalerLatency(scaledObjectNamespace, scaledObject.Name, scalerName, scalerIndex, metricName, float64(latency)) + metricscollector.RecordScalerLatency(scaledObjectNamespace, scaledObject.Name, scalerName, scalerIndex, metricName, latency) } logger.V(1).Info("Getting metrics from scaler", "scaler", scalerName, "metricName", spec.External.Metric.Name, "metrics", metrics, "scalerError", err) } @@ -695,10 +695,10 @@ func (*scaleHandler) getScalerState(ctx context.Context, scaler scalers.Scaler, metricName := spec.External.Metric.Name - var latency int64 + var latency time.Duration metrics, isMetricActive, latency, err := cache.GetMetricsAndActivityForScaler(ctx, scalerIndex, metricName) if latency != -1 { - metricscollector.RecordScalerLatency(scaledObject.Namespace, scaledObject.Name, scalerName, scalerIndex, metricName, float64(latency)) + metricscollector.RecordScalerLatency(scaledObject.Namespace, scaledObject.Name, scalerName, scalerIndex, metricName, latency) } result.Metrics = append(result.Metrics, metrics...) 
logger.V(1).Info("Getting metrics and activity from scaler", "scaler", scalerName, "metricName", metricName, "metrics", metrics, "activity", isMetricActive, "scalerError", err) From 4521af3a16670220b28be56c7ef12c420aa92805 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Thu, 9 Nov 2023 22:20:51 +0100 Subject: [PATCH 02/16] Updates help info to align with the public docs Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/prommetrics.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 6c7f953a244..538a6130db7 100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -36,7 +36,7 @@ var ( prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Name: "build_info", - Help: "A metric with a constant '1' value labeled by version, git_commit and goversion from which KEDA was built.", + Help: "Info metric, with static information about KEDA build like: version, git commit and Golang runtime info.", }, []string{"version", "git_commit", "goversion", "goos", "goarch"}, ) @@ -54,7 +54,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "metrics_value", - Help: "Current value of the metric obtained from the scaler that the Horizontal Pod Autoscaler (HPA) uses to make scaling decisions.", + Help: "The current value for each scaler's metric that would be used by the HPA in computing the target average.", }, metricLabels, ) @@ -72,7 +72,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "metrics_latency_seconds", - Help: "Latency observed by a scaler in getting the metric from the source, in seconds.", + Help: "The latency of retrieving current metric from each scaler, in seconds.", }, metricLabels, ) @@ -108,7 +108,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors_total", - Help: "Total number of errors observed by a scaler.", 
+ Help: "The total number of errors encountered for each scaler.", }, metricLabels, ) @@ -126,7 +126,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaled_object", Name: "errors_total", - Help: "Total number of errors observed by a scaled object.", + Help: "The number of errors that have occurred for each ScaledObject.", }, []string{"namespace", "scaledObject"}, ) @@ -144,7 +144,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "trigger", Name: "handled_total", - Help: "Total number of triggers currently handled.", + Help: "Total number of triggers per trigger type handled.", }, []string{"type"}, ) @@ -162,7 +162,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "resource", Name: "handled_total", - Help: "Total number of ScaledObjects/ScaledJobs currently handled.", + Help: "Total number of KEDA custom resources per namespace for each custom resource type (CRD) handled.", }, []string{"type", "namespace"}, ) @@ -180,7 +180,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "internal_scale_loop", Name: "latency_seconds", - Help: "Internal latency of ScaledObject/ScaledJob loop execution in seconds.", + Help: "Total deviation (in seconds) between the expected execution time and the actual execution time for the scaling loop.", }, []string{"namespace", "type", "resource"}, ) From a0e5ab86dc9a3d68bddfedec84a1a8d9b82ceede Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Thu, 9 Nov 2023 22:29:08 +0100 Subject: [PATCH 03/16] Updates grafana dashboard to use the new metrics Signed-off-by: Bernard Grymonpon --- config/grafana/keda-dashboard.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/grafana/keda-dashboard.json b/config/grafana/keda-dashboard.json index 07abaec730c..4648e636324 100644 --- a/config/grafana/keda-dashboard.json +++ b/config/grafana/keda-dashboard.json @@ -173,7 +173,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by(job) 
(rate(keda_scaler_errors{}[5m]))", + "expr": "sum by(job) (rate(keda_scaler_errors_total{}[5m]))", "legendFormat": "{{ job }}", "range": true, "refId": "A" @@ -313,7 +313,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by(scaler) (rate(keda_scaler_errors{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\", scaler=~\"$scaler\"}[5m]))", + "expr": "sum by(scaler) (rate(keda_scaler_errors_total{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\", scaler=~\"$scaler\"}[5m]))", "legendFormat": "{{ scaler }}", "range": true, "refId": "A" @@ -423,7 +423,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by(scaledObject) (rate(keda_scaled_object_errors{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\"}[5m]))", + "expr": "sum by(scaledObject) (rate(keda_scaled_object_errors_total{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\"}[5m]))", "legendFormat": "{{ scaledObject }}", "range": true, "refId": "A" @@ -997,4 +997,4 @@ "uid": "asdasd8rvmMxdVk", "version": 8, "weekStart": "" -} +} \ No newline at end of file From 571419b13576cf2b92d104b510596d758cb4721b Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Wed, 22 Nov 2023 09:45:35 +0100 Subject: [PATCH 04/16] resolved review comments Signed-off-by: Bernard Grymonpon --- CHANGELOG.md | 2 +- pkg/metricscollector/prommetrics.go | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 31c898fe2d3..edc7ab4ecb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,13 +61,13 @@ Here is an overview of all new **experimental** features: ### Improvements - **General**: Add parameter queryParameters to prometheus-scaler ([#4962](https://github.com/kedacore/keda/issues/4962)) +- **General**: Renamed Prometheus metrics to include units and `total` where appropriate ([#4854](https://github.com/kedacore/keda/issues/4854)) - **General**: Support TriggerAuthentication properties 
from ConfigMap ([#4830](https://github.com/kedacore/keda/issues/4830)) - **Hashicorp Vault**: Add support to get secret that needs write operation (e.g. pki) ([#5067](https://github.com/kedacore/keda/issues/5067)) - **Kafka Scaler**: Ability to set upper bound to the number of partitions with lag ([#3997](https://github.com/kedacore/keda/issues/3997)) - **Kafka Scaler**: Add support for Kerberos authentication (SASL / GSSAPI) ([#4836](https://github.com/kedacore/keda/issues/4836)) - **Prometheus Metrics**: Introduce paused ScaledObjects in Prometheus metrics ([#4430](https://github.com/kedacore/keda/issues/4430)) - **Pulsar Scaler**: support endpointParams in pulsar oauth ([#5069](https://github.com/kedacore/keda/issues/5069)) -- **General**: Renamed Prometheus metrics to include units and `total` where approriate ([#4854](https://github.com/kedacore/keda/issues/4854)) ### Fixes diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 538a6130db7..046c8969058 100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -45,7 +45,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors_total", - Help: "DEPRECATED - use a `sum(scaler_errors_total{scaler!=\"\"})` over all scalers", + Help: "DEPRECATED - will be removed in 2.15 - use a `sum(scaler_errors_total{scaler!=\"\"})` over all scalers", }, []string{}, ) @@ -63,7 +63,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "metrics_latency", - Help: "DEPRECATED - use 'scaler_metrics_latency_seconds' instead.", + Help: "DEPRECATED - will be removed in 2.15 use 'scaler_metrics_latency_seconds' instead.", }, metricLabels, ) @@ -99,7 +99,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors", - Help: "DEPRECATED - use 'scaler_errors_total' instead.", + Help: "DEPRECATED - will be removed in 2.15 - use 'scaler_errors_total' instead.", }, metricLabels, ) @@ -117,7 
+117,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaled_object", Name: "errors", - Help: "DEPRECATED - use 'scaled_object_errors_total' instead.", + Help: "DEPRECATED - will be removed in 2.15 - use 'scaled_object_errors_total' instead.", }, []string{"namespace", "scaledObject"}, ) @@ -135,7 +135,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "trigger", Name: "totals", - Help: "DEPRECATED - use 'trigger_handled_total' instead.", + Help: "DEPRECATED - will be removed in 2.15 - use 'trigger_handled_total' instead.", }, []string{"type"}, ) @@ -153,7 +153,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "resource", Name: "totals", - Help: "DEPRECATED - use 'resource_handled_total' instead.", + Help: "DEPRECATED - will be removed in 2.15 - use 'resource_handled_total' instead.", }, []string{"type", "namespace"}, ) @@ -171,7 +171,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "internal_scale_loop", Name: "latency", - Help: "DEPRECATED - use 'internal_scale_loop_latency_seconds' instead.", + Help: "DEPRECATED - will be removed in 2.15 - use 'internal_scale_loop_latency_seconds' instead.", }, []string{"namespace", "type", "resource"}, ) From c557a34723d63cf895104b90bfc6e434a093fbf9 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Wed, 22 Nov 2023 16:04:09 +0100 Subject: [PATCH 05/16] Updates webhook metrics and e2e tests Signed-off-by: Bernard Grymonpon --- .../webhook/webhook_prommetrics.go | 22 ++++++ .../prometheus_metrics_test.go | 77 ++++++------------- 2 files changed, 46 insertions(+), 53 deletions(-) diff --git a/pkg/metricscollector/webhook/webhook_prommetrics.go b/pkg/metricscollector/webhook/webhook_prommetrics.go index c7ba4df49c0..15c00b4f7f7 100644 --- a/pkg/metricscollector/webhook/webhook_prommetrics.go +++ b/pkg/metricscollector/webhook/webhook_prommetrics.go @@ -31,6 +31,15 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "webhook", Name: "scaled_object_validation_total", + 
Help: "DEPRECATED - will be removed in 2.15 - Use `scaled_object_validations_total` instead.", + }, + []string{"namespace", "action"}, + ) + scaledObjectValidationsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "webhook", + Name: "scaled_object_validations_total", Help: "Total number of scaled object validations", }, []string{"namespace", "action"}, @@ -40,6 +49,15 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "webhook", Name: "scaled_object_validation_errors", + Help: "DEPRECATED - will be removed in 2.15 - Use `scaled_object_validation_errors_total` instead.", + }, + []string{"namespace", "action", "reason"}, + ) + scaledObjectValidationErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: DefaultPromMetricsNamespace, + Subsystem: "webhook", + Name: "scaled_object_validation_errors_total", Help: "Total number of scaled object validating errors", }, []string{"namespace", "action", "reason"}, @@ -48,17 +66,21 @@ var ( func init() { metrics.Registry.MustRegister(scaledObjectValidatingTotal) + metrics.Registry.MustRegister(scaledObjectValidationsTotal) metrics.Registry.MustRegister(scaledObjectValidatingErrors) + metrics.Registry.MustRegister(scaledObjectValidationErrorsTotal) } // RecordScaledObjectValidatingTotal counts the number of ScaledObject validations func RecordScaledObjectValidatingTotal(namespace, action string) { labels := prometheus.Labels{"namespace": namespace, "action": action} scaledObjectValidatingTotal.With(labels).Inc() + scaledObjectValidationsTotal.With(labels).Inc() } // RecordScaledObjectValidatingErrors counts the number of ScaledObject validating errors func RecordScaledObjectValidatingErrors(namespace, action, reason string) { labels := prometheus.Labels{"namespace": namespace, "action": action, "reason": reason} scaledObjectValidatingErrors.With(labels).Inc() + scaledObjectValidationErrorsTotal.With(labels).Inc() } diff --git 
a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go index da6df210f71..88977d30262 100644 --- a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go +++ b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go @@ -350,7 +350,7 @@ func testScalerMetricValue(t *testing.T) { } assert.Equal(t, true, found) } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaler_metrics_value not available") } } @@ -364,22 +364,22 @@ func testScaledObjectErrors(t *testing.T, data templateData) { time.Sleep(20 * time.Second) family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaled_object_errors"]; ok { + if val, ok := family["keda_scaled_object_errors_total"]; ok { errCounterVal1 := getErrorMetricsValue(val) // wait for 2 seconds as pollinginterval is 2 time.Sleep(2 * time.Second) family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaled_object_errors"]; ok { + if val, ok := family["keda_scaled_object_errors_total"]; ok { errCounterVal2 := getErrorMetricsValue(val) assert.NotEqual(t, errCounterVal2, float64(0)) assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaled_object_errors_total not available") } } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaled_object_errors_total not available") } KubectlDeleteWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) @@ -393,51 +393,22 @@ func testScalerErrors(t *testing.T, data templateData) { KubectlApplyWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaler_errors"]; ok { + if val, ok 
:= family["keda_scaler_errors_total"]; ok { errCounterVal1 := getErrorMetricsValue(val) // wait for 20 seconds to correctly fetch metrics. time.Sleep(20 * time.Second) - family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaler_errors"]; ok { - errCounterVal2 := getErrorMetricsValue(val) - assert.NotEqual(t, errCounterVal2, float64(0)) - assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) - } else { - t.Errorf("metric not available") - } - } else { - t.Errorf("metric not available") - } - - KubectlDeleteWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) - KubectlApplyWithTemplate(t, data, "scaledObjectTemplate", scaledObjectTemplate) -} - -func testScalerErrorsTotal(t *testing.T, data templateData) { - t.Log("--- testing scaler errors total ---") - - KubectlDeleteWithTemplate(t, data, "scaledObjectTemplate", scaledObjectTemplate) - KubectlApplyWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) - - family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaler_errors_total"]; ok { - errCounterVal1 := getErrorMetricsValue(val) - - // wait for 2 seconds as pollinginterval is 2 - time.Sleep(2 * time.Second) - family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) if val, ok := family["keda_scaler_errors_total"]; ok { errCounterVal2 := getErrorMetricsValue(val) assert.NotEqual(t, errCounterVal2, float64(0)) assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaler_errors_total not available") } } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaler_errors_total not available") } KubectlDeleteWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) @@ -451,7 +422,7 @@ func 
getErrorMetricsValue(val *prommodel.MetricFamily) float64 { for _, metric := range metrics { return metric.GetCounter().GetValue() } - case "keda_scaled_object_errors": + case "keda_scaled_object_errors_total": metrics := val.GetMetric() for _, metric := range metrics { labels := metric.GetLabel() @@ -461,7 +432,7 @@ func getErrorMetricsValue(val *prommodel.MetricFamily) float64 { } } } - case "keda_scaler_errors": + case "keda_scaler_errors_total": metrics := val.GetMetric() for _, metric := range metrics { labels := metric.GetLabel() @@ -505,7 +476,7 @@ func testScalerMetricLatency(t *testing.T) { family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_scaler_metrics_latency"]; ok { + if val, ok := family["keda_scaler_metrics_latency_seconds"]; ok { var found bool metrics := val.GetMetric() for _, metric := range metrics { @@ -519,7 +490,7 @@ func testScalerMetricLatency(t *testing.T) { } assert.Equal(t, true, found) } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaler_metrics_latency_seconds not available") } } @@ -528,7 +499,7 @@ func testScalableObjectMetrics(t *testing.T) { family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - if val, ok := family["keda_internal_scale_loop_latency"]; ok { + if val, ok := family["keda_internal_scale_loop_latency_seconds"]; ok { var found bool metrics := val.GetMetric() @@ -556,7 +527,7 @@ func testScalableObjectMetrics(t *testing.T) { } assert.Equal(t, true, found) } else { - t.Errorf("scaledobject metric not available") + t.Errorf("keda_internal_scale_loop_latency_seconds metric not available") } } @@ -579,7 +550,7 @@ func testScalerActiveMetric(t *testing.T) { } assert.Equal(t, true, found) } else { - t.Errorf("metric not available") + t.Errorf("metric keda_scaler_active not available") } } @@ -708,7 +679,7 @@ func checkBuildInfo(t *testing.T, families 
map[string]*prommodel.MetricFamily) { family, ok := families["keda_build_info"] if !ok { - t.Errorf("metric not available") + t.Errorf("metric keda_build_info not available") return } @@ -743,9 +714,9 @@ func getLatestCommit(t *testing.T) string { func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]int) { t.Log("--- testing trigger total metrics ---") - family, ok := families["keda_trigger_totals"] + family, ok := families["keda_trigger_handled_total"] if !ok { - t.Errorf("metric not available") + t.Errorf("metric keda_trigger_handled_total not available") return } @@ -772,9 +743,9 @@ func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.Metric func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]map[string]int) { t.Log("--- testing resource total metrics ---") - family, ok := families["keda_resource_totals"] + family, ok := families["keda_resource_handled_total"] if !ok { - t.Errorf("metric not available") + t.Errorf("metric keda_resource_handled_total not available") return } @@ -801,9 +772,9 @@ func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamil func checkWebhookValues(t *testing.T, families map[string]*prommodel.MetricFamily) { t.Log("--- testing webhook metrics ---") - family, ok := families["keda_webhook_scaled_object_validation_errors"] + family, ok := families["keda_webhook_scaled_object_validation_errors_total"] if !ok { - t.Errorf("metric keda_webhook_scaled_object_validation_errors not available") + t.Errorf("metric keda_webhook_scaled_object_validation_errors_total not available") return } @@ -820,9 +791,9 @@ func checkWebhookValues(t *testing.T, families map[string]*prommodel.MetricFamil } assert.GreaterOrEqual(t, metricValue, 1.0, "keda_webhook_scaled_object_validation_errors has to be greater than 0") - family, ok = families["keda_webhook_scaled_object_validation_total"] + family, ok = 
families["keda_webhook_scaled_object_validations_total"] if !ok { - t.Errorf("metric keda_webhook_scaled_object_validation_total not available") + t.Errorf("metric keda_webhook_scaled_object_validations_total not available") return } From 2fae5f3f7143e5286afe011bfee2582d18d8e483 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Wed, 22 Nov 2023 16:13:04 +0100 Subject: [PATCH 06/16] newline at the end of json :thinking_face: Signed-off-by: Bernard Grymonpon --- config/grafana/keda-dashboard.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/grafana/keda-dashboard.json b/config/grafana/keda-dashboard.json index 4648e636324..0f57afd9d59 100644 --- a/config/grafana/keda-dashboard.json +++ b/config/grafana/keda-dashboard.json @@ -997,4 +997,5 @@ "uid": "asdasd8rvmMxdVk", "version": 8, "weekStart": "" -} \ No newline at end of file +} + From 7158f7f467592dd3785242ca53ecea93212fb94f Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Wed, 22 Nov 2023 16:16:57 +0100 Subject: [PATCH 07/16] correct tests Signed-off-by: Bernard Grymonpon --- .../prometheus_metrics/prometheus_metrics_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go index 88977d30262..e020152c4b1 100644 --- a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go +++ b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go @@ -288,7 +288,6 @@ func TestPrometheusMetrics(t *testing.T) { testScalerActiveMetric(t) testScaledObjectErrors(t, data) testScalerErrors(t, data) - testScalerErrorsTotal(t, data) testOperatorMetrics(t, kc, data) testMetricServerMetrics(t) testWebhookMetrics(t, data) @@ -417,11 +416,6 @@ func testScalerErrors(t *testing.T, data templateData) { func getErrorMetricsValue(val *prommodel.MetricFamily) float64 { switch val.GetName() { - case "keda_scaler_errors_total": - metrics := val.GetMetric() - for _, 
metric := range metrics { - return metric.GetCounter().GetValue() - } case "keda_scaled_object_errors_total": metrics := val.GetMetric() for _, metric := range metrics { From 49aaff2c6b830593b76e1ceb32c1dfa6ac05574f Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Thu, 23 Nov 2023 10:25:22 +0100 Subject: [PATCH 08/16] another go at the newline issue Signed-off-by: Bernard Grymonpon --- config/grafana/keda-dashboard.json | 1 - 1 file changed, 1 deletion(-) diff --git a/config/grafana/keda-dashboard.json b/config/grafana/keda-dashboard.json index 0f57afd9d59..e6bea125949 100644 --- a/config/grafana/keda-dashboard.json +++ b/config/grafana/keda-dashboard.json @@ -998,4 +998,3 @@ "version": 8, "weekStart": "" } - From 623d4c773e01c3b23eb41f4d6331f8fce6424dd7 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Sat, 25 Nov 2023 09:53:52 +0100 Subject: [PATCH 09/16] Reworked otel metrics to align with best practices Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/opentelemetry.go | 40 ++++++++++++++++------ pkg/metricscollector/opentelemetry_test.go | 30 ++++++++++++---- 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/pkg/metricscollector/opentelemetry.go b/pkg/metricscollector/opentelemetry.go index 2e5ac4a9071..2415d8d8980 100644 --- a/pkg/metricscollector/opentelemetry.go +++ b/pkg/metricscollector/opentelemetry.go @@ -23,12 +23,14 @@ const meterName = "keda-open-telemetry-metrics" const defaultNamespace = "default" var ( - meterProvider *metric.MeterProvider - meter api.Meter - otScalerErrorsCounter api.Int64Counter - otScaledObjectErrorsCounter api.Int64Counter - otTriggerTotalsCounter api.Int64UpDownCounter - otCrdTotalsCounter api.Int64UpDownCounter + meterProvider *metric.MeterProvider + meter api.Meter + otScalerErrorsCounter api.Int64Counter + otScaledObjectErrorsCounter api.Int64Counter + otTriggerTotalsCounterDeprecated api.Int64UpDownCounter + otCrdTotalsCounterDeprecated api.Int64UpDownCounter + otTriggerTotalsCounter 
api.Int64UpDownCounter + otCrdTotalsCounter api.Int64UpDownCounter otelScalerMetricVal OtelMetricFloat64Val otelScalerMetricsLatencyVal OtelMetricFloat64Val @@ -87,19 +89,29 @@ func initMeters() { otLog.Error(err, msg) } - otTriggerTotalsCounter, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("Total triggers")) + otTriggerTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("DEPRECATED - will be removed in 2.15 - use 'keda.triggers.count' instead")) if err != nil { otLog.Error(err, msg) } - otCrdTotalsCounter, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("Total resources")) + otTriggerTotalsCounter, err = meter.Int64UpDownCounter("keda.triggers.count", api.WithDescription("Total number of triggers per trigger type handled")) + if err != nil { + otLog.Error(err, msg) + } + + otCrdTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("DEPRECATED - will be removed in 2.15 - use 'keda.resources.count' instead")) + if err != nil { + otLog.Error(err, msg) + } + + otCrdTotalsCounter, err = meter.Int64UpDownCounter("keda.resources.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) handled")) if err != nil { otLog.Error(err, msg) } _, err = meter.Float64ObservableGauge( "keda.scaler.metrics.value", - api.WithDescription("Metric Value used for HPA"), + api.WithDescription("The current value for each scaler's metric that would be used by the HPA in computing the target average"), api.WithFloat64Callback(ScalerMetricValueCallback), ) if err != nil { @@ -108,7 +120,8 @@ func initMeters() { _, err = meter.Float64ObservableGauge( "keda.scaler.metrics.latency", - api.WithDescription("Scaler Metrics Latency"), + api.WithDescription("The latency of retrieving current metric from each scaler"), + api.WithUnit("s"), api.WithFloat64Callback(ScalerMetricsLatencyCallback), ) 
if err != nil { @@ -118,6 +131,7 @@ func initMeters() { _, err = meter.Float64ObservableGauge( "keda.internal.scale.loop.latency", api.WithDescription("Internal latency of ScaledObject/ScaledJob loop execution"), + api.WithUnit("s"), api.WithFloat64Callback(ScalableObjectLatencyCallback), ) if err != nil { @@ -126,7 +140,7 @@ func initMeters() { _, err = meter.Float64ObservableGauge( "keda.scaler.active", - api.WithDescription("Activity of a Scaler Metric"), + api.WithDescription("Indicates whether a scaler is active (1), or not (0)"), api.WithFloat64Callback(ScalerActiveCallback), ) if err != nil { @@ -282,12 +296,14 @@ func (o *OtelMetrics) RecordScaledObjectError(namespace string, scaledObject str func (o *OtelMetrics) IncrementTriggerTotal(triggerType string) { if triggerType != "" { + otTriggerTotalsCounterDeprecated.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType))) otTriggerTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType))) } } func (o *OtelMetrics) DecrementTriggerTotal(triggerType string) { if triggerType != "" { + otTriggerTotalsCounterDeprecated.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType))) otTriggerTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType))) } } @@ -301,6 +317,7 @@ func (o *OtelMetrics) IncrementCRDTotal(crdType, namespace string) { attribute.Key("type").String(crdType), ) + otCrdTotalsCounterDeprecated.Add(context.Background(), 1, opt) otCrdTotalsCounter.Add(context.Background(), 1, opt) } @@ -313,6 +330,7 @@ func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) { attribute.Key("namespace").String(namespace), attribute.Key("type").String(crdType), ) + otCrdTotalsCounterDeprecated.Add(context.Background(), -1, opt) otCrdTotalsCounter.Add(context.Background(), -1, opt) } diff --git a/pkg/metricscollector/opentelemetry_test.go 
b/pkg/metricscollector/opentelemetry_test.go index 3b763d2017e..ab3f758f2e6 100644 --- a/pkg/metricscollector/opentelemetry_test.go +++ b/pkg/metricscollector/opentelemetry_test.go @@ -3,6 +3,7 @@ package metricscollector import ( "context" "testing" + "time" "github.com/stretchr/testify/assert" "go.opentelemetry.io/otel/sdk/metric" @@ -59,11 +60,11 @@ func TestIncrementTriggerTotal(t *testing.T) { assert.Nil(t, err) scopeMetrics := got.ScopeMetrics[0] assert.NotEqual(t, len(scopeMetrics.Metrics), 0) - buildInfo := retrieveMetric(scopeMetrics.Metrics, "keda.trigger.totals") + triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.triggers.count") - assert.NotNil(t, buildInfo) + assert.NotNil(t, triggercount) - data := buildInfo.Data.(metricdata.Sum[int64]).DataPoints[0] + data := triggercount.Data.(metricdata.Sum[int64]).DataPoints[0] assert.Equal(t, data.Value, int64(1)) testOtel.DecrementTriggerTotal("testtrigger") @@ -72,10 +73,27 @@ func TestIncrementTriggerTotal(t *testing.T) { assert.Nil(t, err) scopeMetrics = got.ScopeMetrics[0] assert.NotEqual(t, len(scopeMetrics.Metrics), 0) - buildInfo = retrieveMetric(scopeMetrics.Metrics, "keda.trigger.totals") + triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.triggers.count") - assert.NotNil(t, buildInfo) + assert.NotNil(t, triggercount) - data = buildInfo.Data.(metricdata.Sum[int64]).DataPoints[0] + data = triggercount.Data.(metricdata.Sum[int64]).DataPoints[0] assert.Equal(t, data.Value, int64(0)) } + +func TestLoopLatency(t *testing.T) { + testOtel.RecordScalableObjectLatency("namespace", "name", true, 500*time.Millisecond) + got := metricdata.ResourceMetrics{} + err := testReader.Collect(context.Background(), &got) + + assert.Nil(t, err) + scopeMetrics := got.ScopeMetrics[0] + assert.NotEqual(t, len(scopeMetrics.Metrics), 0) + latency := retrieveMetric(scopeMetrics.Metrics, "keda.internal.scale.loop.latency") + + assert.NotNil(t, latency) + assert.Equal(t, latency.Unit, "s") + + data := 
latency.Data.(metricdata.Gauge[float64]).DataPoints[0] + assert.Equal(t, data.Value, float64(0.5)) +} From 13ca4a5e9b958060477a6e055a48e68ca786d59a Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Sat, 25 Nov 2023 10:10:21 +0100 Subject: [PATCH 10/16] updated E2E tests for Otel Signed-off-by: Bernard Grymonpon --- .../opentelemetry_metrics/opentelemetry_metrics_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go b/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go index 57a057ec4ed..a96b01c615b 100644 --- a/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go +++ b/tests/sequential/opentelemetry_metrics/opentelemetry_metrics_test.go @@ -665,7 +665,7 @@ func getLatestCommit(t *testing.T) string { func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]int) { t.Log("--- testing trigger total metrics ---") - family, ok := families["keda_trigger_totals"] + family, ok := families["keda_triggers_count"] if !ok { t.Errorf("metric not available") return @@ -694,7 +694,7 @@ func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.Metric func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]map[string]int) { t.Log("--- testing resource total metrics ---") - family, ok := families["keda_resource_totals"] + family, ok := families["keda_resources_count"] if !ok { t.Errorf("metric not available") return From d72b8600a1ef6733e92503bd652229d6dbdb54c1 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Tue, 28 Nov 2023 08:33:46 +0100 Subject: [PATCH 11/16] Handled -> Registered Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/opentelemetry.go | 16 +++++++-------- pkg/metricscollector/opentelemetry_test.go | 4 ++-- pkg/metricscollector/prommetrics.go | 24 +++++++++++----------- 3 files changed, 22 insertions(+), 22 
deletions(-) diff --git a/pkg/metricscollector/opentelemetry.go b/pkg/metricscollector/opentelemetry.go index 2415d8d8980..8dcabd20fb2 100644 --- a/pkg/metricscollector/opentelemetry.go +++ b/pkg/metricscollector/opentelemetry.go @@ -29,8 +29,8 @@ var ( otScaledObjectErrorsCounter api.Int64Counter otTriggerTotalsCounterDeprecated api.Int64UpDownCounter otCrdTotalsCounterDeprecated api.Int64UpDownCounter - otTriggerTotalsCounter api.Int64UpDownCounter - otCrdTotalsCounter api.Int64UpDownCounter + otTriggerRegisteredTotalsCounter api.Int64UpDownCounter + otCrdRegisteredTotalsCounter api.Int64UpDownCounter otelScalerMetricVal OtelMetricFloat64Val otelScalerMetricsLatencyVal OtelMetricFloat64Val @@ -94,7 +94,7 @@ func initMeters() { otLog.Error(err, msg) } - otTriggerTotalsCounter, err = meter.Int64UpDownCounter("keda.triggers.count", api.WithDescription("Total number of triggers per trigger type handled")) + otTriggerRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.triggers.registered.count", api.WithDescription("Total number of triggers per trigger type registered")) if err != nil { otLog.Error(err, msg) } @@ -104,7 +104,7 @@ func initMeters() { otLog.Error(err, msg) } - otCrdTotalsCounter, err = meter.Int64UpDownCounter("keda.resources.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) handled")) + otCrdRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.resources.registered.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) registered")) if err != nil { otLog.Error(err, msg) } @@ -297,14 +297,14 @@ func (o *OtelMetrics) RecordScaledObjectError(namespace string, scaledObject str func (o *OtelMetrics) IncrementTriggerTotal(triggerType string) { if triggerType != "" { otTriggerTotalsCounterDeprecated.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType))) - 
otTriggerTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType))) + otTriggerRegisteredTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType))) } } func (o *OtelMetrics) DecrementTriggerTotal(triggerType string) { if triggerType != "" { otTriggerTotalsCounterDeprecated.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType))) - otTriggerTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType))) + otTriggerRegisteredTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType))) } } @@ -318,7 +318,7 @@ func (o *OtelMetrics) IncrementCRDTotal(crdType, namespace string) { ) otCrdTotalsCounterDeprecated.Add(context.Background(), 1, opt) - otCrdTotalsCounter.Add(context.Background(), 1, opt) + otCrdRegisteredTotalsCounter.Add(context.Background(), 1, opt) } func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) { @@ -331,7 +331,7 @@ func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) { attribute.Key("type").String(crdType), ) otCrdTotalsCounterDeprecated.Add(context.Background(), -1, opt) - otCrdTotalsCounter.Add(context.Background(), -1, opt) + otCrdRegisteredTotalsCounter.Add(context.Background(), -1, opt) } func getScalerMeasurementOption(namespace string, scaledObject string, scaler string, scalerIndex int, metric string) api.MeasurementOption { diff --git a/pkg/metricscollector/opentelemetry_test.go b/pkg/metricscollector/opentelemetry_test.go index ab3f758f2e6..1b9715a29da 100644 --- a/pkg/metricscollector/opentelemetry_test.go +++ b/pkg/metricscollector/opentelemetry_test.go @@ -60,7 +60,7 @@ func TestIncrementTriggerTotal(t *testing.T) { assert.Nil(t, err) scopeMetrics := got.ScopeMetrics[0] assert.NotEqual(t, len(scopeMetrics.Metrics), 0) - triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.triggers.count") 
+ triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.triggers.registered.count") assert.NotNil(t, triggercount) @@ -73,7 +73,7 @@ func TestIncrementTriggerTotal(t *testing.T) { assert.Nil(t, err) scopeMetrics = got.ScopeMetrics[0] assert.NotEqual(t, len(scopeMetrics.Metrics), 0) - triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.triggers.count") + triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.triggers.registered.count") assert.NotNil(t, triggercount) diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 046c8969058..ee436b2511c 100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -139,12 +139,12 @@ var ( }, []string{"type"}, ) - triggerHandled = prometheus.NewGaugeVec( + triggerRegistered = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "trigger", - Name: "handled_total", - Help: "Total number of triggers per trigger type handled.", + Name: "registered_total", + Help: "Total number of triggers per trigger type registered.", }, []string{"type"}, ) @@ -157,12 +157,12 @@ var ( }, []string{"type", "namespace"}, ) - crdHandled = prometheus.NewGaugeVec( + crdRegistered = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: DefaultPromMetricsNamespace, Subsystem: "resource", - Name: "handled_total", - Help: "Total number of KEDA custom resources per namespace for each custom resource type (CRD) handled.", + Name: "registered_total", + Help: "Total number of KEDA custom resources per namespace for each custom resource type (CRD) registered.", }, []string{"type", "namespace"}, ) @@ -203,9 +203,9 @@ func NewPromMetrics() *PromMetrics { metrics.Registry.MustRegister(scaledObjectErrors) metrics.Registry.MustRegister(scaledObjectPaused) metrics.Registry.MustRegister(triggerTotalsGaugeVecDeprecated) - metrics.Registry.MustRegister(triggerHandled) + metrics.Registry.MustRegister(triggerRegistered) 
metrics.Registry.MustRegister(crdTotalsGaugeVecDeprecated) - metrics.Registry.MustRegister(crdHandled) + metrics.Registry.MustRegister(crdRegistered) metrics.Registry.MustRegister(buildInfo) RecordBuildInfo() @@ -307,14 +307,14 @@ func getLabels(namespace string, scaledObject string, scaler string, scalerIndex func (p *PromMetrics) IncrementTriggerTotal(triggerType string) { if triggerType != "" { - triggerHandled.WithLabelValues(triggerType).Inc() + triggerRegistered.WithLabelValues(triggerType).Inc() triggerTotalsGaugeVecDeprecated.WithLabelValues(triggerType).Inc() } } func (p *PromMetrics) DecrementTriggerTotal(triggerType string) { if triggerType != "" { - triggerHandled.WithLabelValues(triggerType).Dec() + triggerRegistered.WithLabelValues(triggerType).Dec() triggerTotalsGaugeVecDeprecated.WithLabelValues(triggerType).Dec() } } @@ -324,7 +324,7 @@ func (p *PromMetrics) IncrementCRDTotal(crdType, namespace string) { namespace = defaultNamespace } - crdHandled.WithLabelValues(crdType, namespace).Inc() + crdRegistered.WithLabelValues(crdType, namespace).Inc() crdTotalsGaugeVecDeprecated.WithLabelValues(crdType, namespace).Inc() } @@ -333,6 +333,6 @@ func (p *PromMetrics) DecrementCRDTotal(crdType, namespace string) { namespace = defaultNamespace } - crdHandled.WithLabelValues(crdType, namespace).Dec() + crdRegistered.WithLabelValues(crdType, namespace).Dec() crdTotalsGaugeVecDeprecated.WithLabelValues(crdType, namespace).Dec() } From fd4582618e653c0f1439df31fc8e1d239a3bf4e5 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Tue, 28 Nov 2023 08:59:24 +0100 Subject: [PATCH 12/16] align namespace naming to not be pluralized Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/opentelemetry.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/metricscollector/opentelemetry.go b/pkg/metricscollector/opentelemetry.go index 8dcabd20fb2..c0ba2262c9c 100644 --- a/pkg/metricscollector/opentelemetry.go +++ 
b/pkg/metricscollector/opentelemetry.go @@ -94,7 +94,7 @@ func initMeters() { otLog.Error(err, msg) } - otTriggerRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.triggers.registered.count", api.WithDescription("Total number of triggers per trigger type registered")) + otTriggerRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.trigger.registered.count", api.WithDescription("Total number of triggers per trigger type registered")) if err != nil { otLog.Error(err, msg) } @@ -104,7 +104,7 @@ func initMeters() { otLog.Error(err, msg) } - otCrdRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.resources.registered.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) registered")) + otCrdRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.resource.registered.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) registered")) if err != nil { otLog.Error(err, msg) } From f97a6504fea6a16d0a54c22b6cf8bf65f37199f3 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Tue, 28 Nov 2023 09:11:15 +0100 Subject: [PATCH 13/16] rewrite the metric names in the tests Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/opentelemetry_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/metricscollector/opentelemetry_test.go b/pkg/metricscollector/opentelemetry_test.go index 1b9715a29da..58096e636a8 100644 --- a/pkg/metricscollector/opentelemetry_test.go +++ b/pkg/metricscollector/opentelemetry_test.go @@ -60,7 +60,7 @@ func TestIncrementTriggerTotal(t *testing.T) { assert.Nil(t, err) scopeMetrics := got.ScopeMetrics[0] assert.NotEqual(t, len(scopeMetrics.Metrics), 0) - triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.triggers.registered.count") + triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.trigger.registered.count") assert.NotNil(t, triggercount) @@ -73,7 
+73,7 @@ func TestIncrementTriggerTotal(t *testing.T) { assert.Nil(t, err) scopeMetrics = got.ScopeMetrics[0] assert.NotEqual(t, len(scopeMetrics.Metrics), 0) - triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.triggers.registered.count") + triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.trigger.registered.count") assert.NotNil(t, triggercount) From 36ca55a53cd2a13784b468a2680f724df56c978b Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Tue, 28 Nov 2023 09:15:41 +0100 Subject: [PATCH 14/16] even more rewriting ... :facepalm: Signed-off-by: Bernard Grymonpon --- .../prometheus_metrics/prometheus_metrics_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go index e020152c4b1..9191979321d 100644 --- a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go +++ b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go @@ -708,9 +708,9 @@ func getLatestCommit(t *testing.T) string { func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]int) { t.Log("--- testing trigger total metrics ---") - family, ok := families["keda_trigger_handled_total"] + family, ok := families["keda_trigger_registered_total"] if !ok { - t.Errorf("metric keda_trigger_handled_total not available") + t.Errorf("metric keda_trigger_registered_total not available") return } @@ -737,9 +737,9 @@ func checkTriggerTotalValues(t *testing.T, families map[string]*prommodel.Metric func checkCRTotalValues(t *testing.T, families map[string]*prommodel.MetricFamily, expected map[string]map[string]int) { t.Log("--- testing resource total metrics ---") - family, ok := families["keda_resource_handled_total"] + family, ok := families["keda_resource_registered_total"] if !ok { - t.Errorf("metric keda_resource_handled_total not available") + t.Errorf("metric 
keda_resource_registered_total not available") return } From 3a85292a2f174732f0e27353f3cb3475d87b8475 Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Mon, 5 Feb 2024 11:00:57 +0100 Subject: [PATCH 15/16] Tuning merge Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/opentelemetry.go | 4 ++-- pkg/metricscollector/prommetrics.go | 16 +++++++--------- .../webhook/webhook_prommetrics.go | 4 ++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pkg/metricscollector/opentelemetry.go b/pkg/metricscollector/opentelemetry.go index 88666fde69a..fe807260c04 100644 --- a/pkg/metricscollector/opentelemetry.go +++ b/pkg/metricscollector/opentelemetry.go @@ -98,7 +98,7 @@ func initMeters() { otLog.Error(err, msg) } - otTriggerTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("DEPRECATED - will be removed in 2.15 - use 'keda.triggers.count' instead")) + otTriggerTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("DEPRECATED - will be removed in 2.16 - use 'keda.trigger.registered.count' instead")) if err != nil { otLog.Error(err, msg) } @@ -108,7 +108,7 @@ func initMeters() { otLog.Error(err, msg) } - otCrdTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("DEPRECATED - will be removed in 2.15 - use 'keda.resources.count' instead")) + otCrdTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("DEPRECATED - will be removed in 2.16 - use 'keda.resource.registered.count' instead")) if err != nil { otLog.Error(err, msg) } diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 62e51f448f6..38526d9ae1d 100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -45,7 +45,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors_total", - Help: "DEPRECATED - will be 
removed in 2.15 - use a `sum(scaler_errors_total{scaler!=\"\"})` over all scalers", + Help: "DEPRECATED - will be removed in 2.16 - use a `sum(scaler_errors_total{scaler!=\"\"})` over all scalers", }, []string{}, ) @@ -63,7 +63,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "metrics_latency", - Help: "DEPRECATED - will be removed in 2.15 use 'scaler_metrics_latency_seconds' instead.", + Help: "DEPRECATED - will be removed in 2.16 use 'scaler_metrics_latency_seconds' instead.", }, metricLabels, ) @@ -99,7 +99,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaler", Name: "errors", - Help: "DEPRECATED - will be removed in 2.15 - use 'scaler_errors_total' instead.", + Help: "DEPRECATED - will be removed in 2.16 - use 'scaler_errors_total' instead.", }, metricLabels, ) @@ -117,7 +117,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "scaled_object", Name: "errors", - Help: "DEPRECATED - will be removed in 2.15 - use 'scaled_object_errors_total' instead.", + Help: "DEPRECATED - will be removed in 2.16 - use 'scaled_object_errors_total' instead.", }, []string{"namespace", "scaledObject"}, ) @@ -146,7 +146,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "trigger", Name: "totals", - Help: "DEPRECATED - will be removed in 2.15 - use 'trigger_registered_total' instead.", + Help: "DEPRECATED - will be removed in 2.16 - use 'trigger_registered_total' instead.", }, []string{"type"}, ) @@ -164,7 +164,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "resource", Name: "totals", - Help: "DEPRECATED - will be removed in 2.15 - use 'resource_handled_total' instead.", + Help: "DEPRECATED - will be removed in 2.16 - use 'resource_registered_total' instead.", }, []string{"type", "namespace"}, ) @@ -182,7 +182,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "internal_scale_loop", Name: "latency", - Help: "DEPRECATED - will be
instead.", + Help: "DEPRECATED - will be removed in 2.16 - use 'internal_scale_loop_latency_seconds' instead.", }, []string{"namespace", "type", "resource"}, ) @@ -240,8 +240,6 @@ func NewPromMetrics() *PromMetrics { metrics.Registry.MustRegister(crdRegistered) metrics.Registry.MustRegister(scaledJobErrors) - // metrics.Registry.MustRegister(triggerTotalsGaugeVec) - // metrics.Registry.MustRegister(crdTotalsGaugeVec) metrics.Registry.MustRegister(buildInfo) metrics.Registry.MustRegister(cloudeventEmitted) diff --git a/pkg/metricscollector/webhook/webhook_prommetrics.go b/pkg/metricscollector/webhook/webhook_prommetrics.go index 15c00b4f7f7..329d3e4b4e3 100644 --- a/pkg/metricscollector/webhook/webhook_prommetrics.go +++ b/pkg/metricscollector/webhook/webhook_prommetrics.go @@ -31,7 +31,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "webhook", Name: "scaled_object_validation_total", - Help: "DEPRECATED - will be removed in 2.15 - Use `scaled_object_validations_total` instead.", + Help: "DEPRECATED - will be removed in 2.16 - Use `scaled_object_validations_total` instead.", }, []string{"namespace", "action"}, ) @@ -49,7 +49,7 @@ var ( Namespace: DefaultPromMetricsNamespace, Subsystem: "webhook", Name: "scaled_object_validation_errors", - Help: "DEPRECATED - will be removed in 2.15 - Use `scaled_object_validation_errors_total` instead.", + Help: "DEPRECATED - will be removed in 2.16 - Use `scaled_object_validation_errors_total` instead.", }, []string{"namespace", "action", "reason"}, ) From 388be0f93ac0b716dbcc5549c33e8e90499f893b Mon Sep 17 00:00:00 2001 From: Bernard Grymonpon Date: Mon, 5 Feb 2024 15:12:03 +0100 Subject: [PATCH 16/16] fixing tests Signed-off-by: Bernard Grymonpon --- pkg/metricscollector/prommetrics.go | 2 +- .../prometheus_metrics_test.go | 30 ------------------- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/pkg/metricscollector/prommetrics.go b/pkg/metricscollector/prommetrics.go index 38526d9ae1d..1b44852869d 
100644 --- a/pkg/metricscollector/prommetrics.go +++ b/pkg/metricscollector/prommetrics.go @@ -261,7 +261,7 @@ func (p *PromMetrics) RecordScalerMetric(namespace string, scaledResource string // RecordScalerLatency create a measurement of the latency to external metric func (p *PromMetrics) RecordScalerLatency(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, value time.Duration) { - scalerMetricsLatency.With(getLabels(namespace, scaledResource, scaler, triggerIndex, metric, isScaledObject)).Set(float64(value.Seconds())) + scalerMetricsLatency.With(getLabels(namespace, scaledResource, scaler, triggerIndex, metric, isScaledObject)).Set(value.Seconds()) scalerMetricsLatencyDeprecated.With(getLabels(namespace, scaledResource, scaler, triggerIndex, metric, isScaledObject)).Set(float64(value.Milliseconds())) } diff --git a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go index 5c3ddff24ea..93ac598369c 100644 --- a/tests/sequential/prometheus_metrics/prometheus_metrics_test.go +++ b/tests/sequential/prometheus_metrics/prometheus_metrics_test.go @@ -630,36 +630,6 @@ func testScalerErrors(t *testing.T, data templateData) { KubectlApplyWithTemplate(t, data, "scaledObjectTemplate", scaledObjectTemplate) } -func testScalerErrorsTotal(t *testing.T, data templateData) { - t.Log("--- testing scaler errors total ---") - - KubectlDeleteWithTemplate(t, data, "scaledObjectTemplate", scaledObjectTemplate) - KubectlApplyWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) - - family := fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - val, ok := family["keda_scaler_errors_total"] - assert.True(t, ok, "keda_scaler_errors_total not available") - if ok { - errCounterVal1 := getErrorMetricsValue(val) - - // wait for 2 seconds as pollinginterval is 2 - time.Sleep(2 * time.Second) - 
- family = fetchAndParsePrometheusMetrics(t, fmt.Sprintf("curl --insecure %s", kedaOperatorPrometheusURL)) - val, ok := family["keda_scaler_errors_total"] - assert.True(t, ok, "keda_scaler_errors_total not available") - if ok { - errCounterVal2 := getErrorMetricsValue(val) - assert.NotEqual(t, errCounterVal2, float64(0)) - assert.GreaterOrEqual(t, errCounterVal2, errCounterVal1) - } - } - - KubectlDeleteWithTemplate(t, data, "wrongScaledObjectTemplate", wrongScaledObjectTemplate) - time.Sleep(2 * time.Second) - KubectlApplyWithTemplate(t, data, "scaledObjectTemplate", scaledObjectTemplate) -} - func getErrorMetricsValue(val *prommodel.MetricFamily) float64 { switch val.GetName() { case "keda_scaled_object_errors_total":