From 1a035e4f2768d7ff9e9be63d8ae6bc730c5cd31d Mon Sep 17 00:00:00 2001 From: Paschalis Tsilias Date: Wed, 3 Apr 2024 10:39:01 +0300 Subject: [PATCH] flow: separate component and controller path IDs into new labels (#6786) Signed-off-by: Paschalis Tsilias --- internal/flow/internal/controller/loader.go | 14 ++++- internal/flow/internal/controller/metrics.go | 16 +++--- .../controller/node_builtin_component.go | 6 +- .../controller/node_builtin_component_test.go | 25 +++++++++ .../internal/controller/node_config_import.go | 6 +- .../controller/node_custom_component.go | 3 +- .../alerts/controller.libsonnet | 2 +- .../dashboards/controller.libsonnet | 4 +- .../dashboards/prometheus.libsonnet | 55 ++++++++++--------- 9 files changed, 86 insertions(+), 45 deletions(-) diff --git a/internal/flow/internal/controller/loader.go b/internal/flow/internal/controller/loader.go index d9b328160497..2514d9f4c873 100644 --- a/internal/flow/internal/controller/loader.go +++ b/internal/flow/internal/controller/loader.go @@ -75,12 +75,14 @@ func NewLoader(opts LoaderOptions) *Loader { reg = opts.ComponentRegistry ) + parent, id := splitPath(globals.ControllerID) + if reg == nil { reg = NewDefaultComponentRegistry(opts.ComponentGlobals.MinStability) } l := &Loader{ - log: log.With(globals.Logger, "controller_id", globals.ControllerID), + log: log.With(globals.Logger, "controller_path", parent, "controller_id", id), tracer: tracing.WrapTracerForLoader(globals.TraceProvider, globals.ControllerID), globals: globals, services: services, @@ -99,9 +101,9 @@ func NewLoader(opts LoaderOptions) *Loader { graph: &dag.Graph{}, originalGraph: &dag.Graph{}, cache: newValueCache(), - cm: newControllerMetrics(globals.ControllerID), + cm: newControllerMetrics(parent, id), } - l.cc = newControllerCollector(l, globals.ControllerID) + l.cc = newControllerCollector(l, parent, id) if globals.Registerer != nil { globals.Registerer.MustRegister(l.cc) @@ -909,3 +911,9 @@ func (l *Loader) 
collectCustomComponentReferences(stmts ast.Body, uniqueReferenc } } } + +func splitPath(id string) (string, string) { + parent, id := path.Split(id) + parent, _ = strings.CutSuffix(parent, "/") + return "/" + parent, id +} diff --git a/internal/flow/internal/controller/metrics.go b/internal/flow/internal/controller/metrics.go index e60d8a51e391..f640f11b3253 100644 --- a/internal/flow/internal/controller/metrics.go +++ b/internal/flow/internal/controller/metrics.go @@ -18,7 +18,7 @@ type controllerMetrics struct { } // newControllerMetrics inits the metrics for the components controller -func newControllerMetrics(id string) *controllerMetrics { +func newControllerMetrics(parent, id string) *controllerMetrics { cm := &controllerMetrics{ slowComponentThreshold: 1 * time.Minute, } @@ -31,14 +31,14 @@ func newControllerMetrics(id string) *controllerMetrics { cm.controllerEvaluation = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "agent_component_controller_evaluating", Help: "Tracks if the controller is currently in the middle of a graph evaluation", - ConstLabels: map[string]string{"controller_id": id}, + ConstLabels: map[string]string{"controller_path": parent, "controller_id": id}, }) cm.componentEvaluationTime = prometheus.NewHistogram( prometheus.HistogramOpts{ Name: "agent_component_evaluation_seconds", Help: "Time spent performing component evaluation", - ConstLabels: map[string]string{"controller_id": id}, + ConstLabels: map[string]string{"controller_path": parent, "controller_id": id}, Buckets: evaluationTimesBuckets, NativeHistogramBucketFactor: 1.1, NativeHistogramMaxBucketNumber: 100, @@ -49,7 +49,7 @@ func newControllerMetrics(id string) *controllerMetrics { prometheus.HistogramOpts{ Name: "agent_component_dependencies_wait_seconds", Help: "Time spent by components waiting to be evaluated after their dependency is updated.", - ConstLabels: map[string]string{"controller_id": id}, + ConstLabels: map[string]string{"controller_path": parent, "controller_id": 
id}, Buckets: evaluationTimesBuckets, NativeHistogramBucketFactor: 1.1, NativeHistogramMaxBucketNumber: 100, @@ -60,13 +60,13 @@ func newControllerMetrics(id string) *controllerMetrics { cm.evaluationQueueSize = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "agent_component_evaluation_queue_size", Help: "Tracks the number of components waiting to be evaluated in the worker pool", - ConstLabels: map[string]string{"controller_id": id}, + ConstLabels: map[string]string{"controller_path": parent, "controller_id": id}, }) cm.slowComponentEvaluationTime = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "agent_component_evaluation_slow_seconds", Help: fmt.Sprintf("Number of seconds spent evaluating components that take longer than %v to evaluate", cm.slowComponentThreshold), - ConstLabels: map[string]string{"controller_id": id}, + ConstLabels: map[string]string{"controller_path": parent, "controller_id": id}, }, []string{"component_id"}) return cm @@ -100,14 +100,14 @@ type controllerCollector struct { runningComponentsTotal *prometheus.Desc } -func newControllerCollector(l *Loader, id string) *controllerCollector { +func newControllerCollector(l *Loader, parent, id string) *controllerCollector { return &controllerCollector{ l: l, runningComponentsTotal: prometheus.NewDesc( "agent_component_controller_running_components", "Total number of running components.", []string{"health_type"}, - map[string]string{"controller_id": id}, + map[string]string{"controller_path": parent, "controller_id": id}, ), } } diff --git a/internal/flow/internal/controller/node_builtin_component.go b/internal/flow/internal/controller/node_builtin_component.go index 028ca2362777..6bf5c6bf36e5 100644 --- a/internal/flow/internal/controller/node_builtin_component.go +++ b/internal/flow/internal/controller/node_builtin_component.go @@ -165,11 +165,13 @@ func NewBuiltinComponentNode(globals ComponentGlobals, reg component.Registratio func getManagedOptions(globals ComponentGlobals, cn 
*BuiltinComponentNode) component.Options { cn.registry = prometheus.NewRegistry() + parent, id := splitPath(cn.globalID) return component.Options{ ID: cn.globalID, - Logger: log.With(globals.Logger, "component", cn.globalID), + Logger: log.With(globals.Logger, "component_path", parent, "component_id", id), Registerer: prometheus.WrapRegistererWith(prometheus.Labels{ - "component_id": cn.globalID, + "component_path": parent, + "component_id": id, }, cn.registry), Tracer: tracing.WrapTracer(globals.TraceProvider, cn.globalID), diff --git a/internal/flow/internal/controller/node_builtin_component_test.go b/internal/flow/internal/controller/node_builtin_component_test.go index 3be8307dd828..9e777490cd7f 100644 --- a/internal/flow/internal/controller/node_builtin_component_test.go +++ b/internal/flow/internal/controller/node_builtin_component_test.go @@ -37,3 +37,28 @@ func TestLocalID(t *testing.T) { }) require.Equal(t, "/data/local.id", filepath.ToSlash(mo.DataPath)) } + +func TestSplitPath(t *testing.T) { + var testcases = []struct { + input string + path string + id string + }{ + {"", "/", ""}, + {"remotecfg", "/", "remotecfg"}, + {"prometheus.remote_write", "/", "prometheus.remote_write"}, + {"custom_component.default/prometheus.remote_write", "/custom_component.default", "prometheus.remote_write"}, + + {"local.file.default", "/", "local.file.default"}, + {"a_namespace.a.default/local.file.default", "/a_namespace.a.default", "local.file.default"}, + {"a_namespace.a.default/b_namespace.b.default/local.file.default", "/a_namespace.a.default/b_namespace.b.default", "local.file.default"}, + + {"a_namespace.a.default/b_namespace.b.default/c_namespace.c.default", "/a_namespace.a.default/b_namespace.b.default", "c_namespace.c.default"}, + } + + for _, tt := range testcases { + path, id := splitPath(tt.input) + require.Equal(t, tt.path, path) + require.Equal(t, tt.id, id) + } +} diff --git a/internal/flow/internal/controller/node_config_import.go 
b/internal/flow/internal/controller/node_config_import.go index 7cc95bdb4d1f..e87a1223cc5b 100644 --- a/internal/flow/internal/controller/node_config_import.go +++ b/internal/flow/internal/controller/node_config_import.go @@ -90,11 +90,13 @@ func NewImportConfigNode(block *ast.BlockStmt, globals ComponentGlobals, sourceT func getImportManagedOptions(globals ComponentGlobals, cn *ImportConfigNode) component.Options { cn.registry = prometheus.NewRegistry() + parent, id := splitPath(cn.globalID) return component.Options{ ID: cn.globalID, - Logger: log.With(globals.Logger, "config", cn.globalID), + Logger: log.With(globals.Logger, "config_path", parent, "config_id", id), Registerer: prometheus.WrapRegistererWith(prometheus.Labels{ - "config_id": cn.globalID, + "config_path": parent, + "config_id": id, }, cn.registry), Tracer: tracing.WrapTracer(globals.TraceProvider, cn.globalID), DataPath: filepath.Join(globals.DataPath, cn.globalID), diff --git a/internal/flow/internal/controller/node_custom_component.go b/internal/flow/internal/controller/node_custom_component.go index 8b3e05a74e89..d0d53d804e5c 100644 --- a/internal/flow/internal/controller/node_custom_component.go +++ b/internal/flow/internal/controller/node_custom_component.go @@ -104,6 +104,7 @@ func NewCustomComponentNode(globals ComponentGlobals, b *ast.BlockStmt, getConfi componentName := b.GetBlockName() importNamespace, customComponentName := ExtractImportAndDeclare(componentName) + parent, node := splitPath(globalID) cn := &CustomComponentNode{ id: id, @@ -115,7 +116,7 @@ func NewCustomComponentNode(globals ComponentGlobals, b *ast.BlockStmt, getConfi customComponentName: customComponentName, moduleController: globals.NewModuleController(globalID), OnBlockNodeUpdate: globals.OnBlockNodeUpdate, - logger: log.With(globals.Logger, "component", globalID), + logger: log.With(globals.Logger, "component_path", parent, "component_id", node), getConfig: getConfig, block: b, diff --git 
a/operations/agent-flow-mixin/alerts/controller.libsonnet b/operations/agent-flow-mixin/alerts/controller.libsonnet index 3aeb5eabbb10..5e3454ecb2c8 100644 --- a/operations/agent-flow-mixin/alerts/controller.libsonnet +++ b/operations/agent-flow-mixin/alerts/controller.libsonnet @@ -6,7 +6,7 @@ alert.newGroup( // Component evaluations are taking too long, which can lead to e.g. stale targets. alert.newRule( 'SlowComponentEvaluations', - 'sum by (cluster, namespace, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0', + 'sum by (cluster, namespace, component_path, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0', 'Flow component evaluations are taking too long.', '15m', ), diff --git a/operations/agent-flow-mixin/dashboards/controller.libsonnet b/operations/agent-flow-mixin/dashboards/controller.libsonnet index ec059de98189..e58aa00e2a47 100644 --- a/operations/agent-flow-mixin/dashboards/controller.libsonnet +++ b/operations/agent-flow-mixin/dashboards/controller.libsonnet @@ -264,10 +264,10 @@ local filename = 'agent-flow-controller.json'; panel.withQueries([ panel.newQuery( expr=||| - sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) + sum by (component_path, component_id) (rate(agent_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) / scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))) |||, - legendFormat='{{component_id}}', + legendFormat='{{component_path}} {{component_id}}', ), ]) ), diff --git a/operations/agent-flow-mixin/dashboards/prometheus.libsonnet b/operations/agent-flow-mixin/dashboards/prometheus.libsonnet index 21ae79f3b063..d88f41e82662 100644 --- a/operations/agent-flow-mixin/dashboards/prometheus.libsonnet +++ b/operations/agent-flow-mixin/dashboards/prometheus.libsonnet @@ -105,13 +105,13 @@ local 
remoteWritePanels(y_offset) = [ panel.withQueries([ panel.newQuery( expr=||| - sum by (instance, component_id) ( - prometheus_remote_storage_highest_timestamp_in_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component"} + sum by (instance, component_path, component_id) ( + prometheus_remote_storage_highest_timestamp_in_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component"} - ignoring(url, remote_name) group_right(instance) - prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"} + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} ) |||, - legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) ), @@ -130,11 +130,11 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| sum without (remote_name, url) ( - rate(prometheus_remote_storage_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + - rate(prometheus_remote_storage_metadata_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + rate(prometheus_remote_storage_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + + rate(prometheus_remote_storage_metadata_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) ) |||, - 
legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) ), @@ -152,7 +152,7 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| histogram_quantile(0.99, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) )) |||, legendFormat='99th percentile', @@ -160,15 +160,15 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| histogram_quantile(0.50, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) )) |||, legendFormat='50th percentile', ), panel.newQuery( expr=||| - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_id=~"$component"}[$__rate_interval])) / - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_id=~"$component"}[$__rate_interval])) + sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / + 
sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) |||, legendFormat='Average', ), @@ -223,15 +223,15 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| sum without (remote_name, url) ( - prometheus_remote_storage_shards{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"} + prometheus_remote_storage_shards{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} ) |||, - legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), panel.newQuery( expr=||| min ( - prometheus_remote_storage_shards_min{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"} + prometheus_remote_storage_shards_min{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} ) |||, legendFormat='Minimum', @@ -239,7 +239,7 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| max ( - prometheus_remote_storage_shards_max{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"} + prometheus_remote_storage_shards_max{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} ) |||, legendFormat='Maximum', @@ -260,10 +260,10 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| sum without (url, remote_name) ( - rate(prometheus_remote_storage_samples_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + 
rate(prometheus_remote_storage_samples_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) ) |||, - legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) ), @@ -282,10 +282,10 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_failed_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + rate(prometheus_remote_storage_samples_failed_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) ) |||, - legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) ), @@ -304,10 +304,10 @@ local remoteWritePanels(y_offset) = [ panel.newQuery( expr=||| sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_retried_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"}[$__rate_interval]) + rate(prometheus_remote_storage_samples_retried_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) ) |||, - legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) ), @@ -333,7 +333,7 @@ local remoteWritePanels(y_offset) = [ panel.withQueries([ panel.newQuery( expr=||| - sum(agent_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component", url=~"$url"}) + sum(agent_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", 
component_path=~"$component_path", component_id=~"$component", url=~"$url"}) |||, legendFormat='Series', ), @@ -356,9 +356,9 @@ local remoteWritePanels(y_offset) = [ panel.withQueries([ panel.newQuery( expr=||| - agent_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_id=~"$component", url=~"$url"} + agent_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"} |||, - legendFormat='{{instance}} / {{component_id}}', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) ), @@ -379,9 +379,9 @@ local remoteWritePanels(y_offset) = [ panel.withQueries([ panel.newQuery( expr=||| - sum by (component_id) (agent_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_id=~"$component", url=~"$url"}) + sum by (component_path, component_id) (agent_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) |||, - legendFormat='{{component_id}}', + legendFormat='{{component_path}} {{component_id}}', ), ]) ), @@ -406,6 +406,9 @@ local remoteWritePanels(y_offset) = [ dashboard.newMultiTemplateVariable('instance', ||| label_values(agent_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) |||), + dashboard.newMultiTemplateVariable('component_path', ||| + label_values(agent_wal_samples_appended_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"prometheus\\.remote_write\\..*", component_path=~".*"}, component_path) + |||), dashboard.newMultiTemplateVariable('component', ||| label_values(agent_wal_samples_appended_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", 
component_id=~"prometheus\\.remote_write\\..*"}, component_id) |||),