diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 52700f3..731135d 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -73,7 +73,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) ) ) @@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) ) ) @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.timeseriesPanel('Initial sync duration') + + $.timeseriesPanel('Initial sync duration', unit='ms') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + { targets: [ target { diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet index 9578511..720b6ff 100644 --- a/cortex-mixin/dashboards/compactor.libsonnet +++ b/cortex-mixin/dashboards/compactor.libsonnet @@ -14,7 +14,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Per-instance runs', ||| @@ -44,9 +43,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.timeseriesPanel('Compacted blocks 
/ sec') + + $.timeseriesPanel('Compacted blocks / sec', unit='ops') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Compacted blocks / sec', ||| @@ -55,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('Per-block compaction duration') + + $.timeseriesPanel('Per-block compaction duration', unit='ms') + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.panelDescription( 'Per-block compaction duration', @@ -85,9 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Garbage Collector') .addPanel( - $.timeseriesPanel('Blocks marked for deletion / sec') + - $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks marked for deletion / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks'), ) .addPanel( $.successFailurePanel( @@ -111,7 +108,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + { yaxes: $.yaxes('ops') } ) .addPanel( - $.timeseriesPanel('Metadata Sync Duration') + + $.timeseriesPanel('Metadata Sync Duration', unit='ms') + // This metric tracks the duration of a per-tenant metadata sync. 
$.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) diff --git a/cortex-mixin/dashboards/config.libsonnet b/cortex-mixin/dashboards/config.libsonnet index 74ef0a9..10692a3 100644 --- a/cortex-mixin/dashboards/config.libsonnet +++ b/cortex-mixin/dashboards/config.libsonnet @@ -8,19 +8,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Startup config file') .addPanel( - $.timeseriesPanel('Startup config file hashes') + + $.timeseriesPanel('Startup config file hashes', unit='instances') + $.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: $.yaxes('instances') }, + $.stack, ) ) .addRow( $.row('Runtime config file') .addPanel( - $.timeseriesPanel('Runtime config file hashes') + + $.timeseriesPanel('Runtime config file hashes', unit='instances') + $.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: $.yaxes('instances') }, + $.stack, ) ), } diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index f525f45..3d9eea3 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -62,6 +62,44 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), }, + timeseriesPanel(title, unit='short'):: { + datasource: '$datasource', + fieldConfig: { + defaults: { + custom: { + drawStyle: 'line', + fillOpacity: 1, + lineWidth: 1, + pointSize: 5, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + }, + thresholds: { + mode: 'absolute', + steps: [], + }, + unit: unit, + }, + overrides: [], + }, + options: { + legend: { + showLegend: true, + }, + 
tooltip: { + mode: 'single', + sort: 'none', + }, + }, + links: [], + targets: [], + title: title, + type: 'timeseries', + }, // The mixin allow specialism of the job selector depending on if its a single binary // deployment or a namespaced one. @@ -108,6 +146,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; } for target in super.targets ], + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. + stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, }, latencyPanel(metricName, selector, multiplier='1e3'):: @@ -121,7 +188,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, successFailurePanel(title, successMetric, failureMetric):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='short') + $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.stack + { aliasColors: { @@ -132,7 +199,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Displays started, completed and failed rate. startedCompletedFailedPanel(title, startedMetric, completedMetric, failedMetric):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='ops') + $.queryPanel([startedMetric, completedMetric, failedMetric], ['started', 'completed', 'failed']) + $.stack + { aliasColors: { @@ -160,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerMemoryWorkingSetPanel(title, containerName):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up // summing the memory of the old instance/pod (whose metric will be stale for 5m) to the new instance/pod. 
@@ -180,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerNetworkPanel(title, metric, instanceName):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { namespace: $.namespaceMatcher(), @@ -199,7 +266,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), containerDiskWritesPanel(title, containerName):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -220,7 +287,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskReadsPanel(title, containerName):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -239,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskSpaceUtilization(title, containerName):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='percentunit') + $.queryPanel( ||| max by(persistentvolumeclaim) ( @@ -266,7 +333,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'label_name="%s"' % containerName, goHeapInUsePanel(title, jobName):: - $.timeseriesPanel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel( 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], '{{%s}}' % $._config.per_instance_label @@ -361,13 +428,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; getObjectStoreRows(title, component):: [ super.row(title) .addPanel( - $.timeseriesPanel('Operations / sec') + + $.timeseriesPanel('Operations / sec', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), 
component], '{{operation}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack ) .addPanel( - $.timeseriesPanel('Error rate') + + $.timeseriesPanel('Error rate', unit='percentunit') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) @@ -406,7 +472,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }; super.row(title) .addPanel( - $.timeseriesPanel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -439,7 +505,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.timeseriesPanel('Hit ratio') + + $.timeseriesPanel('Hit ratio', unit='percentunit') + $.queryPanel( ||| sum( diff --git a/cortex-mixin/dashboards/object-store.libsonnet b/cortex-mixin/dashboards/object-store.libsonnet index 0fc1824..d58976a 100644 --- a/cortex-mixin/dashboards/object-store.libsonnet +++ b/cortex-mixin/dashboards/object-store.libsonnet @@ -7,29 +7,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Components') .addPanel( - $.timeseriesPanel('RPS / component') + + $.timeseriesPanel('RPS / component', unit='rps') + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{component}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.timeseriesPanel('Error rate / component') + - $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + - { yaxes: $.yaxes('percentunit') }, + 
$.timeseriesPanel('Error rate / component', unit='percentunit') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') ) ) .addRow( $.row('Operations') .addPanel( - $.timeseriesPanel('RPS / operation') + + $.timeseriesPanel('RPS / operation', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{operation}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.timeseriesPanel('Error rate / operation') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / operation', unit='percentunit') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') ) ) .addRow( diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 04b30ea..212ab9d 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -8,13 +8,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.timeseriesPanel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), ) .addPanel( - $.timeseriesPanel('Retries') + - 
$.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Retries', unit='short') + + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1), ) .addPanel( $.timeseriesPanel('Queue Length') + @@ -24,7 +23,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Scheduler') .addPanel( - $.timeseriesPanel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), ) .addPanel( @@ -78,9 +77,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('Number of Sharded Queries per Query') + + $.timeseriesPanel('Number of Sharded Queries per Query', unit='short') + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') } + $.panelDescription( 'Number of Sharded Queries per Query', ||| @@ -93,9 +91,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.timeseriesPanel('Stages') + + $.timeseriesPanel('Stages', unit='ms') + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher($._config.job_names.querier), '{{slice}}') + - { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( @@ -113,33 +110,28 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.timeseriesPanel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Series per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', 
$.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.timeseriesPanel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Chunks per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.timeseriesPanel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Samples per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') .addPanel( - $.timeseriesPanel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.timeseriesPanel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( $.timeseriesPanel('Consistency checks failed') + @@ -151,13 +143,12 @@ local utils = import 
'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.timeseriesPanel('Bucket indexes loaded (per querier)') + + $.timeseriesPanel('Bucket indexes loaded (per querier)', unit='short') + $.queryPanel([ 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'avg(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), - ], ['Max', 'Min', 'Average']) + - { yaxes: $.yaxes('short') }, + ], ['Max', 'Min', 'Average']), ) .addPanel( $.successFailurePanel( @@ -167,7 +158,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.timeseriesPanel('Bucket indexes load latency') + + $.timeseriesPanel('Bucket indexes load latency', unit='ms') + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), ) ) @@ -175,21 +166,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway - Blocks storage') .addPanel( - $.timeseriesPanel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks queried / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks'), ) .addPanel( - $.timeseriesPanel('Data fetched / sec') + + $.timeseriesPanel('Data fetched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) 
.addPanel( - $.timeseriesPanel('Data touched / sec') + + $.timeseriesPanel('Data touched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) ) .addRowIf( @@ -238,7 +226,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{%s}}' % $._config.per_instance_label) ) .addPanel( - $.timeseriesPanel('Index-header lazy load duration') + + $.timeseriesPanel('Index-header lazy load duration', unit='ms') + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index cd60aaf..c0ddbe4 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -96,15 +96,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, 
route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -114,15 +113,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -157,7 +155,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) @@ -168,15 +166,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + 
utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -186,15 +183,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( @@ -205,22 +201,21 @@ local utils = import 
'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.timeseriesPanel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -234,8 +229,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' ) + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( $.timeseriesPanel('Latency (getmulti)') + diff --git a/cortex-mixin/dashboards/rollout-progress.libsonnet b/cortex-mixin/dashboards/rollout-progress.libsonnet index baae9fe..775a199 100644 --- a/cortex-mixin/dashboards/rollout-progress.libsonnet +++ b/cortex-mixin/dashboards/rollout-progress.libsonnet @@ -125,7 +125,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 0 }, }, - $.timeseriesPanel('Writes 99th Latency') + + $.timeseriesPanel('Writes 99th latency', unit='s') + $.newStatPanel(||| 
histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -176,7 +176,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 4 }, }, - $.timeseriesPanel('Reads 99th Latency') + + $.timeseriesPanel('Reads 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index a63bf96..88742e2 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'average' @@ -110,16 +110,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re]) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', ruler_config_api_routes_re)]) ) .addPanel( - $.timeseriesPanel('Per route p99 Latency') + + $.timeseriesPanel('Per route p99 latency', unit='s') + $.queryPanel( 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], '{{ route 
}}' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -129,7 +128,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) ) @@ -140,7 +139,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) @@ -148,14 +147,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') .addPanel( - $.timeseriesPanel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.timeseriesPanel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', 
'{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( $.timeseriesPanel('Consistency checks failed') + @@ -185,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'), ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' @@ -201,7 +198,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index 81ffc43..67d1058 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -71,15 +71,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, 
$.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -89,15 +88,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -107,7 +105,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) @@ -118,15 +116,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', 
'/cortex.Ingester/Push')]) ) .addPanel( - $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -136,7 +133,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.timeseriesPanel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) @@ -158,7 +155,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('Upload latency') + + $.timeseriesPanel('Upload latency', unit='ms') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Upload latency', @@ -188,7 +185,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('Compactions latency') + + $.timeseriesPanel('Compactions latency', unit='ms') + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Compaction latency', @@ -231,9 +228,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('WAL truncations latency (includes checkpointing)') + + $.timeseriesPanel('WAL truncations latency (includes checkpointing)', unit='s') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / 
sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') } + $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| @@ -243,7 +239,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.timeseriesPanel('Corruptions / sec') + + $.timeseriesPanel('Corruptions / sec', unit='ops') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),