Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reworks the prometheus metrics to adhere to best practices #5174

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ Here is an overview of all new **experimental** features:
### Improvements

- **General**: Add OPENTELEMETRY flag in e2e test YAML ([#5375](https://github.com/kedacore/keda/issues/5375))
- **General**: Renamed Prometheus metrics to include units and `total` where approriate ([#4854](https://github.com/kedacore/keda/issues/4854))

### Fixes

Expand All @@ -74,7 +75,7 @@ You can find all deprecations in [this overview](https://github.com/kedacore/ked

New deprecation(s):

- TODO ([#XXX](https://github.com/kedacore/keda/issues/XXX))
- Various Prometheus metrics have been renamed to follow the preferred naming conventions. The old ones are still available, but will be removed in the future ([#4854](https://github.com/kedacore/keda/issues/4854)).

### Breaking Changes

Expand Down
6 changes: 3 additions & 3 deletions config/grafana/keda-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@
"uid": "${datasource}"
},
"editorMode": "code",
"expr": "sum by(job) (rate(keda_scaler_errors{}[5m]))",
"expr": "sum by(job) (rate(keda_scaler_errors_total{}[5m]))",
"legendFormat": "{{ job }}",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -313,7 +313,7 @@
"uid": "${datasource}"
},
"editorMode": "code",
"expr": "sum by(scaler) (rate(keda_scaler_errors{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\", scaler=~\"$scaler\"}[5m]))",
"expr": "sum by(scaler) (rate(keda_scaler_errors_total{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\", scaler=~\"$scaler\"}[5m]))",
"legendFormat": "{{ scaler }}",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -423,7 +423,7 @@
"uid": "${datasource}"
},
"editorMode": "code",
"expr": "sum by(scaledObject) (rate(keda_scaled_object_errors{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\"}[5m]))",
"expr": "sum by(scaledObject) (rate(keda_scaled_object_errors_total{exported_namespace=~\"$namespace\", scaledObject=~\"$scaledObject\"}[5m]))",
"legendFormat": "{{ scaledObject }}",
"range": true,
"refId": "A"
Expand Down
10 changes: 6 additions & 4 deletions pkg/metricscollector/metricscollectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ limitations under the License.

package metricscollector

import "time"

const (
ClusterTriggerAuthenticationResource = "cluster_trigger_authentication"
TriggerAuthenticationResource = "trigger_authentication"
Expand All @@ -34,10 +36,10 @@ type MetricsCollector interface {
RecordScalerMetric(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, value float64)

// RecordScalerLatency create a measurement of the latency to external metric
RecordScalerLatency(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, value float64)
RecordScalerLatency(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, value time.Duration)

// RecordScalableObjectLatency create a measurement of the latency executing scalable object loop
RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64)
RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration)

// RecordScalerActive create a measurement of the activity of the scaler
RecordScalerActive(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, active bool)
Expand Down Expand Up @@ -92,14 +94,14 @@ func RecordScalerMetric(namespace string, scaledObject string, scaler string, tr
}

// RecordScalerLatency create a measurement of the latency to external metric
func RecordScalerLatency(namespace string, scaledObject string, scaler string, triggerIndex int, metric string, isScaledObject bool, value float64) {
func RecordScalerLatency(namespace string, scaledObject string, scaler string, triggerIndex int, metric string, isScaledObject bool, value time.Duration) {
for _, element := range collectors {
element.RecordScalerLatency(namespace, scaledObject, scaler, triggerIndex, metric, isScaledObject, value)
}
}

// RecordScalableObjectLatency create a measurement of the latency executing scalable object loop
func RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) {
func RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) {
for _, element := range collectors {
element.RecordScalableObjectLatency(namespace, name, isScaledObject, value)
}
Expand Down
59 changes: 39 additions & 20 deletions pkg/metricscollector/opentelemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"runtime"
"strconv"
"time"

"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
Expand All @@ -22,13 +23,15 @@ const meterName = "keda-open-telemetry-metrics"
const defaultNamespace = "default"

var (
meterProvider *metric.MeterProvider
meter api.Meter
otScalerErrorsCounter api.Int64Counter
otScaledObjectErrorsCounter api.Int64Counter
otScaledJobErrorsCounter api.Int64Counter
otTriggerTotalsCounter api.Int64UpDownCounter
otCrdTotalsCounter api.Int64UpDownCounter
meterProvider *metric.MeterProvider
meter api.Meter
otScalerErrorsCounter api.Int64Counter
otScaledObjectErrorsCounter api.Int64Counter
otScaledJobErrorsCounter api.Int64Counter
otTriggerTotalsCounterDeprecated api.Int64UpDownCounter
otCrdTotalsCounterDeprecated api.Int64UpDownCounter
otTriggerRegisteredTotalsCounter api.Int64UpDownCounter
otCrdRegisteredTotalsCounter api.Int64UpDownCounter

otelScalerMetricVal OtelMetricFloat64Val
otelScalerMetricsLatencyVal OtelMetricFloat64Val
Expand Down Expand Up @@ -95,19 +98,29 @@ func initMeters() {
otLog.Error(err, msg)
}

otTriggerTotalsCounter, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("Total triggers"))
otTriggerTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.trigger.totals", api.WithDescription("DEPRECATED - will be removed in 2.16 - use 'keda.trigger.registered.count' instead"))
if err != nil {
otLog.Error(err, msg)
}

otCrdTotalsCounter, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("Total resources"))
otTriggerRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.trigger.registered.count", api.WithDescription("Total number of triggers per trigger type registered"))
if err != nil {
otLog.Error(err, msg)
}

otCrdTotalsCounterDeprecated, err = meter.Int64UpDownCounter("keda.resource.totals", api.WithDescription("DEPRECATED - will be removed in 2.16 - use 'keda.resource.registered.count' instead"))
if err != nil {
otLog.Error(err, msg)
}

otCrdRegisteredTotalsCounter, err = meter.Int64UpDownCounter("keda.resource.registered.count", api.WithDescription("Total number of KEDA custom resources per namespace for each custom resource type (CRD) registered"))
if err != nil {
otLog.Error(err, msg)
}

_, err = meter.Float64ObservableGauge(
"keda.scaler.metrics.value",
api.WithDescription("Metric Value used for HPA"),
api.WithDescription("The current value for each scaler's metric that would be used by the HPA in computing the target average"),
api.WithFloat64Callback(ScalerMetricValueCallback),
)
if err != nil {
Expand All @@ -116,7 +129,8 @@ func initMeters() {

_, err = meter.Float64ObservableGauge(
"keda.scaler.metrics.latency",
api.WithDescription("Scaler Metrics Latency"),
api.WithDescription("The latency of retrieving current metric from each scaler"),
api.WithUnit("s"),
api.WithFloat64Callback(ScalerMetricsLatencyCallback),
)
if err != nil {
Expand All @@ -126,6 +140,7 @@ func initMeters() {
_, err = meter.Float64ObservableGauge(
"keda.internal.scale.loop.latency",
api.WithDescription("Internal latency of ScaledObject/ScaledJob loop execution"),
api.WithUnit("s"),
api.WithFloat64Callback(ScalableObjectLatencyCallback),
)
if err != nil {
Expand All @@ -134,7 +149,7 @@ func initMeters() {

_, err = meter.Float64ObservableGauge(
"keda.scaler.active",
api.WithDescription("Activity of a Scaler Metric"),
api.WithDescription("Indicates whether a scaler is active (1), or not (0)"),
api.WithFloat64Callback(ScalerActiveCallback),
)
if err != nil {
Expand Down Expand Up @@ -208,8 +223,8 @@ func ScalerMetricsLatencyCallback(_ context.Context, obsrv api.Float64Observer)
}

// RecordScalerLatency create a measurement of the latency to external metric
func (o *OtelMetrics) RecordScalerLatency(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, value float64) {
otelScalerMetricsLatencyVal.val = value
func (o *OtelMetrics) RecordScalerLatency(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool, value time.Duration) {
otelScalerMetricsLatencyVal.val = value.Seconds()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
otelScalerMetricsLatencyVal.val = value.Seconds()
otelScalerMetricsLatencyVal.val = value.Milliseconds()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's been settled that KEDA will use Seconds in opentelemetry metrics.

otelScalerMetricsLatencyVal.measurementOption = getScalerMeasurementOption(namespace, scaledResource, scaler, triggerIndex, metric, isScaledObject)
}

Expand All @@ -222,7 +237,7 @@ func ScalableObjectLatencyCallback(_ context.Context, obsrv api.Float64Observer)
}

// RecordScalableObjectLatency create a measurement of the latency executing scalable object loop
func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value float64) {
func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string, isScaledObject bool, value time.Duration) {
resourceType := "scaledjob"
if isScaledObject {
resourceType = "scaledobject"
Expand All @@ -233,7 +248,7 @@ func (o *OtelMetrics) RecordScalableObjectLatency(namespace string, name string,
attribute.Key("type").String(resourceType),
attribute.Key("name").String(name))

otelInternalLoopLatencyVal.val = value
otelInternalLoopLatencyVal.val = value.Seconds()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
otelInternalLoopLatencyVal.val = value.Seconds()
otelInternalLoopLatencyVal.val = value.Milliseconds()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's been settled that KEDA will use Seconds in opentelemetry metrics.

otelInternalLoopLatencyVal.measurementOption = opt
}

Expand Down Expand Up @@ -315,13 +330,15 @@ func (o *OtelMetrics) RecordScaledJobError(namespace string, scaledJob string, e

func (o *OtelMetrics) IncrementTriggerTotal(triggerType string) {
if triggerType != "" {
otTriggerTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerTotalsCounterDeprecated.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerRegisteredTotalsCounter.Add(context.Background(), 1, api.WithAttributes(attribute.Key("type").String(triggerType)))
}
}

func (o *OtelMetrics) DecrementTriggerTotal(triggerType string) {
if triggerType != "" {
otTriggerTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerTotalsCounterDeprecated.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType)))
otTriggerRegisteredTotalsCounter.Add(context.Background(), -1, api.WithAttributes(attribute.Key("type").String(triggerType)))
}
}

Expand All @@ -334,7 +351,8 @@ func (o *OtelMetrics) IncrementCRDTotal(crdType, namespace string) {
attribute.Key("type").String(crdType),
)

otCrdTotalsCounter.Add(context.Background(), 1, opt)
otCrdTotalsCounterDeprecated.Add(context.Background(), 1, opt)
otCrdRegisteredTotalsCounter.Add(context.Background(), 1, opt)
}

func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) {
Expand All @@ -346,7 +364,8 @@ func (o *OtelMetrics) DecrementCRDTotal(crdType, namespace string) {
attribute.Key("namespace").String(namespace),
attribute.Key("type").String(crdType),
)
otCrdTotalsCounter.Add(context.Background(), -1, opt)
otCrdTotalsCounterDeprecated.Add(context.Background(), -1, opt)
otCrdRegisteredTotalsCounter.Add(context.Background(), -1, opt)
}

func getScalerMeasurementOption(namespace string, scaledResource string, scaler string, triggerIndex int, metric string, isScaledObject bool) api.MeasurementOption {
Expand Down
30 changes: 24 additions & 6 deletions pkg/metricscollector/opentelemetry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package metricscollector
import (
"context"
"testing"
"time"

"github.com/stretchr/testify/assert"
"go.opentelemetry.io/otel/sdk/metric"
Expand Down Expand Up @@ -59,11 +60,11 @@ func TestIncrementTriggerTotal(t *testing.T) {
assert.Nil(t, err)
scopeMetrics := got.ScopeMetrics[0]
assert.NotEqual(t, len(scopeMetrics.Metrics), 0)
buildInfo := retrieveMetric(scopeMetrics.Metrics, "keda.trigger.totals")
triggercount := retrieveMetric(scopeMetrics.Metrics, "keda.trigger.registered.count")

assert.NotNil(t, buildInfo)
assert.NotNil(t, triggercount)

data := buildInfo.Data.(metricdata.Sum[int64]).DataPoints[0]
data := triggercount.Data.(metricdata.Sum[int64]).DataPoints[0]
assert.Equal(t, data.Value, int64(1))

testOtel.DecrementTriggerTotal("testtrigger")
Expand All @@ -72,10 +73,27 @@ func TestIncrementTriggerTotal(t *testing.T) {
assert.Nil(t, err)
scopeMetrics = got.ScopeMetrics[0]
assert.NotEqual(t, len(scopeMetrics.Metrics), 0)
buildInfo = retrieveMetric(scopeMetrics.Metrics, "keda.trigger.totals")
triggercount = retrieveMetric(scopeMetrics.Metrics, "keda.trigger.registered.count")

assert.NotNil(t, buildInfo)
assert.NotNil(t, triggercount)

data = buildInfo.Data.(metricdata.Sum[int64]).DataPoints[0]
data = triggercount.Data.(metricdata.Sum[int64]).DataPoints[0]
assert.Equal(t, data.Value, int64(0))
}

func TestLoopLatency(t *testing.T) {
testOtel.RecordScalableObjectLatency("namespace", "name", true, 500*time.Millisecond)
got := metricdata.ResourceMetrics{}
err := testReader.Collect(context.Background(), &got)

assert.Nil(t, err)
scopeMetrics := got.ScopeMetrics[0]
assert.NotEqual(t, len(scopeMetrics.Metrics), 0)
latency := retrieveMetric(scopeMetrics.Metrics, "keda.internal.scale.loop.latency")

assert.NotNil(t, latency)
assert.Equal(t, latency.Unit, "s")

data := latency.Data.(metricdata.Gauge[float64]).DataPoints[0]
assert.Equal(t, data.Value, float64(0.5))
}
Loading
Loading