diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 0bf88c4..52700f3 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -10,22 +10,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Total Alerts') + + $.timeseriesPanel('Total Alerts') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Total Silences') + + $.timeseriesPanel('Total Silences') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short') ) ) .addRow( $.row('Alerts Received') .addPanel( - $.panel('APS') + + $.timeseriesPanel('APS') + $.queryPanel( [ ||| @@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Alert Notifications') .addPanel( - $.panel('NPS') + + $.timeseriesPanel('NPS') + $.queryPanel( [ ||| @@ -56,7 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('NPS by integration') + + $.timeseriesPanel('NPS by integration') + $.queryPanel( [ ||| @@ -73,18 +73,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) ) ) .addRow( $.row('Configuration API (gateway) + Alertmanager UI') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) ) ) @@ -94,7 +94,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Per %s Tenants' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Alerts' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -110,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Silences' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Tenant Configuration Sync') .addPanel( - $.panel('Syncs/sec') + + $.timeseriesPanel('Syncs/sec') + $.queryPanel( [ ||| @@ -135,14 +135,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Syncs/sec (By Reason)') + + $.timeseriesPanel('Syncs/sec (By Reason)') + $.queryPanel( 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{reason}}' ) ) .addPanel( - $.panel('Ring Check Errors/sec') + + $.timeseriesPanel('Ring Check Errors/sec') + $.queryPanel( 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), 'errors' @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Initial syncs /sec') + + $.timeseriesPanel('Initial syncs /sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{outcome}}' @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Initial sync duration') + + $.timeseriesPanel('Initial sync duration') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + { targets: [ target { @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Fetch state from other alertmanagers /sec') + + $.timeseriesPanel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -201,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Runtime State Sync') .addPanel( - $.panel('Replicate state to other alertmanagers /sec') + + $.timeseriesPanel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ -215,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Merge state from other alertmanagers /sec') + + $.timeseriesPanel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -229,7 +229,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Persist state to remote storage /sec') + + $.timeseriesPanel('Persist state to remote storage /sec') + $.queryPanel( [ ||| diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet index aeb6449..9578511 100644 --- a/cortex-mixin/dashboards/compactor.libsonnet +++ b/cortex-mixin/dashboards/compactor.libsonnet @@ -23,7 +23,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Tenants compaction progress') + + $.timeseriesPanel('Tenants compaction progress') + $.queryPanel(||| ( cortex_compactor_tenants_processing_succeeded{%s} + @@ -44,7 +44,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Compacted blocks / sec') + + $.timeseriesPanel('Compacted blocks / sec') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + { yaxes: $.yaxes('ops') } + $.panelDescription( @@ -55,7 +55,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Per-block compaction duration') + + $.timeseriesPanel('Per-block compaction duration') + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.panelDescription( 'Per-block compaction duration', @@ -68,11 +68,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Average blocks / tenant') + + $.timeseriesPanel('Average blocks / tenant') + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( - $.panel('Tenants with largest number of blocks') + + $.timeseriesPanel('Tenants with largest number of blocks') + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') + $.panelDescription( 'Tenants with largest number of blocks', @@ -85,7 +85,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Garbage Collector') .addPanel( - $.panel('Blocks marked for deletion / sec') + + $.timeseriesPanel('Blocks marked for deletion / sec') + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + { yaxes: $.yaxes('ops') }, ) @@ -111,7 +111,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Metadata Sync Duration') + + $.timeseriesPanel('Metadata Sync Duration') + // This metric tracks the duration of a per-tenant metadata sync. $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) diff --git a/cortex-mixin/dashboards/config.libsonnet b/cortex-mixin/dashboards/config.libsonnet index 9240ef8..74ef0a9 100644 --- a/cortex-mixin/dashboards/config.libsonnet +++ b/cortex-mixin/dashboards/config.libsonnet @@ -8,7 +8,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Startup config file') .addPanel( - $.panel('Startup config file hashes') + + $.timeseriesPanel('Startup config file hashes') + $.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + $.stack + { yaxes: $.yaxes('instances') }, @@ -17,7 +17,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Runtime config file') .addPanel( - $.panel('Runtime config file hashes') + + $.timeseriesPanel('Runtime config file hashes') + $.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + $.stack + { yaxes: $.yaxes('instances') }, diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index c0d2b08..f525f45 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, successFailurePanel(title, successMetric, failureMetric):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.stack + { aliasColors: { @@ -132,7 +132,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Displays started, completed and failed rate. startedCompletedFailedPanel(title, startedMetric, completedMetric, failedMetric):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([startedMetric, completedMetric, failedMetric], ['started', 'completed', 'failed']) + $.stack + { aliasColors: { @@ -143,7 +143,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerCPUUsagePanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([ 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container=~"%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], 'min(container_spec_cpu_quota{%s,container=~"%s"} / container_spec_cpu_period{%s,container=~"%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], @@ -160,7 +160,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerMemoryWorkingSetPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up // summing the memory of the old instance/pod (whose metric will be stale for 5m) to the new instance/pod. @@ -180,7 +180,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerNetworkPanel(title, metric, instanceName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel( 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { namespace: $.namespaceMatcher(), @@ -199,7 +199,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), containerDiskWritesPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -220,7 +220,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskReadsPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -239,7 +239,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskSpaceUtilization(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel( ||| max by(persistentvolumeclaim) ( @@ -266,7 +266,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'label_name="%s"' % containerName, goHeapInUsePanel(title, jobName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel( 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], '{{%s}}' % $._config.per_instance_label @@ -361,39 +361,39 @@ local utils = import 'mixin-utils/utils.libsonnet'; getObjectStoreRows(title, component):: [ super.row(title) .addPanel( - $.panel('Operations / sec') + + $.timeseriesPanel('Operations / sec') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + $.stack + { yaxes: $.yaxes('rps') }, ) .addPanel( - $.panel('Error rate') + + $.timeseriesPanel('Error rate') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Latency of Op: Attributes') + + $.timeseriesPanel('Latency of Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Exists') + + $.timeseriesPanel('Latency of Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), $.row('') .addPanel( - $.panel('Latency of Op: Get') + + $.timeseriesPanel('Latency of Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: GetRange') + + $.timeseriesPanel('Latency of Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Upload') + + $.timeseriesPanel('Latency of Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Delete') + + $.timeseriesPanel('Latency of Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), ], @@ -406,7 +406,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }; super.row(title) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.queryPanel( ||| sum by(operation) ( @@ -425,7 +425,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -439,7 +439,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio') + $.queryPanel( ||| sum( diff --git a/cortex-mixin/dashboards/object-store.libsonnet b/cortex-mixin/dashboards/object-store.libsonnet index 69e257b..0fc1824 100644 --- a/cortex-mixin/dashboards/object-store.libsonnet +++ b/cortex-mixin/dashboards/object-store.libsonnet @@ -7,13 +7,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Components') .addPanel( - $.panel('RPS / component') + + $.timeseriesPanel('RPS / component') + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{component}}') + $.stack + { yaxes: $.yaxes('rps') }, ) .addPanel( - $.panel('Error rate / component') + + $.timeseriesPanel('Error rate / component') + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + { yaxes: $.yaxes('percentunit') }, ) @@ -21,13 +21,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Operations') .addPanel( - $.panel('RPS / operation') + + $.timeseriesPanel('RPS / operation') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{operation}}') + $.stack + { yaxes: $.yaxes('rps') }, ) .addPanel( - $.panel('Error rate / operation') + + $.timeseriesPanel('Error rate / operation') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) @@ -35,30 +35,30 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Op: Get') + + $.timeseriesPanel('Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: GetRange') + + $.timeseriesPanel('Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get_range"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Exists') + + $.timeseriesPanel('Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="exists"}' % $.namespaceMatcher()), ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Attributes') + + $.timeseriesPanel('Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="attributes"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Upload') + + $.timeseriesPanel('Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="upload"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Delete') + + $.timeseriesPanel('Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="delete"}' % $.namespaceMatcher()), ) ), diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index cada5c8..04b30ea 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -8,34 +8,34 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration') + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), ) .addPanel( - $.panel('Retries') + + $.timeseriesPanel('Retries') + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Scheduler') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Frontend - Query Splitting and Results Cache') .addPanel( - $.panel('Intervals per Query') + + $.timeseriesPanel('Intervals per Query') + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + $.panelDescription( 'Intervals per Query', @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Results Cache Hit %') + + $.timeseriesPanel('Results Cache Hit %') + $.queryPanel(||| sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) or sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) @@ -53,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - $.panel('Results Cache misses') + + $.timeseriesPanel('Results Cache misses') + $.queryPanel(||| sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) or sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) @@ -63,7 +63,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend - Query sharding') .addPanel( - $.panel('Sharded Queries Ratio') + + $.timeseriesPanel('Sharded Queries Ratio') + $.queryPanel(||| sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{%s}[$__rate_interval])) / sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{%s}[$__rate_interval])) @@ -78,7 +78,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Number of Sharded Queries per Query') + + $.timeseriesPanel('Number of Sharded Queries per Query') + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + { yaxes: $.yaxes('short') } + $.panelDescription( @@ -93,37 +93,37 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('Stages') + + $.timeseriesPanel('Stages') + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher($._config.job_names.querier), '{{slice}}') + { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( - $.panel('Chunk cache misses') + + $.timeseriesPanel('Chunk cache misses') + $.queryPanel(||| sum(rate(cortex_cache_fetched_keys{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%(q)s,name="chunksmemcache"}[1m])) or sum(rate(cortex_cache_fetched_keys_total{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits_total{%(q)s,name="chunksmemcache"}[1m])) ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Hit rate'), ) .addPanel( - $.panel('Chunk cache corruptions') + + $.timeseriesPanel('Chunk cache corruptions') + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Series per Query') + + $.timeseriesPanel('Series per Query') + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Chunks per Query') + + $.timeseriesPanel('Chunks per Query') + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Samples per Query') + + $.timeseriesPanel('Samples per Query') + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1) + { yaxes: $.yaxes('short') }, ) @@ -132,17 +132,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + + $.timeseriesPanel('Number of store-gateways hit per Query') + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Refetches of missing blocks per Query') + + $.timeseriesPanel('Refetches of missing blocks per Query') + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Bucket indexes loaded (per querier)') + + $.timeseriesPanel('Bucket indexes loaded (per querier)') + $.queryPanel([ 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), @@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Bucket indexes load latency') + + $.timeseriesPanel('Bucket indexes load latency') + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), ) ) @@ -175,18 +175,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway - Blocks storage') .addPanel( - $.panel('Blocks queried / sec') + + $.timeseriesPanel('Blocks queried / sec') + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( - $.panel('Data fetched / sec') + + $.timeseriesPanel('Data fetched / sec') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( - $.panel('Data touched / sec') + + $.timeseriesPanel('Data touched / sec') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, @@ -196,15 +196,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Series fetch duration (per request)') + + $.timeseriesPanel('Series fetch duration (per request)') + $.latencyPanel('cortex_bucket_store_series_get_all_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series merge duration (per request)') + + $.timeseriesPanel('Series merge duration (per request)') + $.latencyPanel('cortex_bucket_store_series_merge_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series returned (per request)') + + $.timeseriesPanel('Series returned (per request)') + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__rate_interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) @@ -212,7 +212,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Blocks currently loaded') + + $.timeseriesPanel('Blocks currently loaded') + $.queryPanel('sum(cortex_bucket_store_blocks_loaded{component="store-gateway",%s}) without (user)' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) ) .addPanel( @@ -234,15 +234,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Lazy loaded index-headers') + + $.timeseriesPanel('Lazy loaded index-headers') + $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{%s}}' % $._config.per_instance_label) ) .addPanel( - $.panel('Index-header lazy load duration') + + $.timeseriesPanel('Index-header lazy load duration') + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series hash cache hit ratio') + + $.timeseriesPanel('Series hash cache hit ratio') + $.queryPanel(||| sum(rate(cortex_bucket_store_series_hash_cache_hits_total{%s}[$__rate_interval])) / diff --git a/cortex-mixin/dashboards/reads-resources.libsonnet b/cortex-mixin/dashboards/reads-resources.libsonnet index f0750c8..437a57a 100644 --- a/cortex-mixin/dashboards/reads-resources.libsonnet +++ b/cortex-mixin/dashboards/reads-resources.libsonnet @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ruler') .addPanel( - $.panel('Rules') + + $.timeseriesPanel('Rules') + $.queryPanel( 'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 5a72078..cd60aaf 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant queries / sec') + + $.timeseriesPanel('Instant queries / sec') + $.statPanel(||| sum( rate( @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range queries / sec') + + $.timeseriesPanel('Range queries / sec') + $.statPanel(||| sum( rate( @@ -92,15 +92,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + @@ -110,15 +110,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' ) + @@ -142,37 +142,37 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( - $.panel('Latency (Time in Queue)') + + $.timeseriesPanel('Latency (Time in Queue)') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) ) .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) .addRow( $.row('Querier') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' ) + @@ -182,15 +182,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + @@ -201,15 +201,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' ) + @@ -220,7 +220,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.queryPanel( ||| sum by(operation) ( @@ -238,7 +238,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('ops') }, ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -252,7 +252,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio') + $.queryPanel( ||| sum by(item_type) ( diff --git a/cortex-mixin/dashboards/rollout-progress.libsonnet b/cortex-mixin/dashboards/rollout-progress.libsonnet index 16c5409..baae9fe 100644 --- a/cortex-mixin/dashboards/rollout-progress.libsonnet +++ b/cortex-mixin/dashboards/rollout-progress.libsonnet @@ -20,7 +20,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Rollout progress // - $.panel('Rollout progress') + + $.timeseriesPanel('Rollout progress') + $.barGauge([ // Multi-zone deployments are grouped together removing the "zone-X" suffix. // After the grouping, the resulting label is called "cortex_service". @@ -89,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Writes // - $.panel('Writes - 2xx') + + $.timeseriesPanel('Writes - 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -100,7 +100,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 0 }, }, - $.panel('Writes - 4xx') + + $.timeseriesPanel('Writes - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 0 }, }, - $.panel('Writes - 5xx') + + $.timeseriesPanel('Writes - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -125,7 +125,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 0 }, }, - $.panel('Writes 99th Latency') + + $.timeseriesPanel('Writes 99th Latency') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -140,7 +140,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Reads // - $.panel('Reads - 2xx') + + $.timeseriesPanel('Reads - 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 4 }, }, - $.panel('Reads - 4xx') + + $.timeseriesPanel('Reads - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -164,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 4 }, }, - $.panel('Reads - 5xx') + + $.timeseriesPanel('Reads - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -176,7 +176,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 4 }, }, - $.panel('Reads 99th Latency') + + $.timeseriesPanel('Reads 99th Latency') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -191,7 +191,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Unhealthy pods // - $.panel('Unhealthy pods') + + $.timeseriesPanel('Unhealthy pods') + $.newStatPanel([ ||| kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"} @@ -280,7 +280,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Performance comparison with 24h ago // - $.panel('Latency vs 24h ago') + + $.timeseriesPanel('Latency vs 24h ago') + $.queryPanel([||| 1 - ( avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index b243198..a63bf96 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -67,26 +67,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Active Configurations') + + $.timeseriesPanel('Active Configurations') + $.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Total Rules') + + $.timeseriesPanel('Total Rules') + $.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Read from Ingesters - QPS') + + $.timeseriesPanel('Read from Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) .addPanel( - $.panel('Write to Ingesters - QPS') + + $.timeseriesPanel('Write to Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) ) .addRow( $.row('Rule Evaluations Global') .addPanel( - $.panel('EPS') + + $.timeseriesPanel('EPS') + $.queryPanel( [ $.rulerQueries.ruleEvaluations.success % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + $.queryPanel( $.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'average' @@ -106,15 +106,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Configuration API (gateway)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re]) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', ruler_config_api_routes_re)]) ) .addPanel( - $.panel('Per route p99 Latency') + + $.timeseriesPanel('Per route p99 Latency') + $.queryPanel( 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], '{{ route }}' @@ -125,22 +125,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Writes (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) ) .addRow( $.row('Reads (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) @@ -148,17 +148,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + + $.timeseriesPanel('Number of store-gateways hit per Query') + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Refetches of missing blocks per Query') + + $.timeseriesPanel('Refetches of missing blocks per Query') + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -166,33 +166,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Notifications') .addPanel( - $.panel('Delivery Errors') + + $.timeseriesPanel('Delivery Errors') + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') ) .addPanel( - $.panel('Dropped') + + $.timeseriesPanel('Dropped') + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher($._config.job_names.ruler), '{{ user }}') ) ) .addRow( ($.row('Group Evaluations') + { collapse: true }) .addPanel( - $.panel('Missed Iterations') + + $.timeseriesPanel('Missed Iterations') + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + $.queryPanel( $.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' ), ) .addPanel( - $.panel('Failures') + + $.timeseriesPanel('Failures') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher($._config.job_names.ruler)], '{{ rule_group }}' ) @@ -201,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index 6ac244e..e078a35 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -38,7 +38,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Scaling') + { height: '400px' }) .addPanel( - $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } + + $.timeseriesPanel('Workload-based scaling') + { sort: { col: 0, desc: false } } + $.tablePanel([ ||| sort_desc( diff --git a/cortex-mixin/dashboards/writes-resources.libsonnet b/cortex-mixin/dashboards/writes-resources.libsonnet index 64f83ef..e11ac22 100644 --- a/cortex-mixin/dashboards/writes-resources.libsonnet +++ b/cortex-mixin/dashboards/writes-resources.libsonnet @@ -31,7 +31,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('In-memory series') + + $.timeseriesPanel('In-memory series') + $.queryPanel( 'sum by(%s) (cortex_ingester_memory_series{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index c656364..81ffc43 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -34,7 +34,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Samples / sec') + + $.timeseriesPanel('Samples / sec') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Active Series') + + $.timeseriesPanel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} / on(%(group_by_cluster)s) group_left @@ -56,26 +56,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + @@ -85,15 +85,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Distributor') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' ) + @@ -103,26 +103,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Key-value store for high-availability (HA) deduplication') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + @@ -132,11 +132,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Key-value store for the ingesters ring') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) @@ -158,7 +158,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Upload latency') + + $.timeseriesPanel('Upload latency') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Upload latency', @@ -188,7 +188,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Compactions latency') + + $.timeseriesPanel('Compactions latency') + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Compaction latency', @@ -231,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('WAL truncations latency (includes checkpointing)') + + $.timeseriesPanel('WAL truncations latency (includes checkpointing)') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + { yaxes: $.yaxes('s') } + $.panelDescription( @@ -243,7 +243,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Corruptions / sec') + + $.timeseriesPanel('Corruptions / sec') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),