diff --git a/monitoring/pra/dashboard.json b/monitoring/pra/dashboard.json new file mode 100644 index 0000000000..3ee29d2509 --- /dev/null +++ b/monitoring/pra/dashboard.json @@ -0,0 +1,1197 @@ +{ + "__inputs": [ + { + "description": "Prometheus server that will be used for all panels in the dashboard.", + "label": "Prometheus", + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + }, + { + "description": "Namespace associated with the Zenko instance", + "label": "namespace", + "name": "namespace", + "type": "constant", + "value": "zenko" + }, + { + "description": "Name of the ZenkoDR instance", + "label": "zenko instance name", + "name": "zenkoName", + "type": "constant", + "value": "artesca-data" + }, + { + "description": "Name of the kafka instance/job/cluster_name", + "label": "kafka instance", + "name": "kafka_instance", + "type": "constant", + "value": "artesca-data-dr-base-queue" + }, + { + "description": "Name of the kafka connect job", + "label": "kafka connect source job", + "name": "kafka_connect_src_job", + "type": "constant", + "value": "artesca-data-dr-base-queue-connector-metrics" + }, + { + "description": "Name of the kafka connect job", + "label": "kafka connect sink job", + "name": "kafka_connect_sink_job", + "type": "constant", + "value": "artesca-data-dr-base-queue-connector-metrics" + }, + { + "description": "Promethes label expression for lifecycle jobs", + "label": "Lifecycle jobs", + "name": "lifecycle_jobs", + "type": "constant", + "value": "artesca-data-backbeat-lifecycle-.*-headless" + }, + { + "description": "Promethes label expression for mongo jobs", + "label": "MongoDb jobs", + "name": "mongo_jobs", + "type": "constant", + "value": "zenko/data-db-mongodb-sharded-shard.*" + }, + { + "description": "Promethes label expression to filter PRA locations", + "label": "Locations", + "name": "locations", + "type": "constant", + "value": "glacier" + }, + { + "description": "Expected number of replicas", + "label": "Replicas", + "name": "replicas", + "type": "constant", + "value": "1" + } + ], + "annotations": { + "list": [] + }, + "description": "", + "editable": true, + "gnetId": null, + "hideControls": false, + "id": null, + "links": [], + "panels": [ + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [], + "max": 1, + "min": "0", + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "yellow", + "index": 1, + "line": true, + "op": "gt", + "value": 50.0, + "yaxis": "left" + }, + { + "color": "green", + "index": 2, + "line": true, + "op": "gt", + "value": 100.0, + "yaxis": "left" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "hideTimeOverride": false, + "id": 1, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "sum(up{job=\"${kafka_connect_src_job}\", namespace=\"${namespace}\", drSinkInstance=\"\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "KafkaConnect Source", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [], + "max": 1, + "min": "0", + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "yellow", + "index": 1, + "line": true, + "op": "gt", + "value": 50.0, + "yaxis": "left" + }, + { + "color": "green", + "index": 2, + "line": true, + "op": "gt", + "value": 100.0, + "yaxis": "left" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "sum(up{job=\"${kafka_connect_sink_job}\", namespace=\"${namespace}\", drSinkInstance!=\"\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "KafkaConnect Sink", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [], + "max": 1, + "min": "0", + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "yellow", + "index": 1, + "line": true, + "op": "gt", + "value": 50.0, + "yaxis": "left" + }, + { + "color": "green", + "index": 2, + "line": true, + "op": "gt", + "value": 100.0, + "yaxis": "left" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "hideTimeOverride": false, + "id": 3, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "sum(up{job=\"${kafka_instance}\", namespace=\"${namespace}\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Kafka", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Time since the last eligible op on the primary site.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [], + "noValue": "-", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#808080", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "green", + "index": 1, + "line": true, + "op": "gt", + "value": 0.0, + "yaxis": "left" + }, + { + "color": "super-light-yellow", + "index": 2, + "line": true, + "op": "gt", + "value": 1800.0, + "yaxis": "left" + }, + { + "color": "orange", + "index": 3, + "line": true, + "op": "gt", + "value": 3600.0, + "yaxis": "left" + }, + { + "color": "red", + "index": 4, + "line": true, + "op": "gt", + "value": 3700.0, + "yaxis": "left" + } + ] + }, + "unit": "clockms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "time() - s3_lifecycle_last_timestamp_ms{location=~\"${locations}\", job=~\"${lifecycle_jobs}\", namespace=\"${namespace}\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Last lifecycle op", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Time since the last mongo DB write on DR site.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [], + "noValue": "-", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#808080", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "green", + "index": 1, + "line": true, + "op": "gt", + "value": 0.0, + "yaxis": "left" + }, + { + "color": "super-light-yellow", + "index": 2, + "line": true, + "op": "gt", + "value": 1800.0, + "yaxis": "left" + }, + { + "color": "orange", + "index": 3, + "line": true, + "op": "gt", + "value": 3600.0, + "yaxis": "left" + }, + { + "color": "red", + "index": 4, + "line": true, + "op": "gt", + "value": 3700.0, + "yaxis": "left" + } + ] + }, + "unit": "clockms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "time() - mongodb_ss_repl_lastWrite_lastWriteDate{job=~\"${mongo_jobs}\", namespace=\"${namespace}\", drSinkInstance!=\"\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Last DR mongo write", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "collapsed": false, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "hideTimeOverride": false, + "id": 6, + "links": [], + "maxDataPoints": 100, + "panels": [], + "targets": [], + "title": "Kafka Lag", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 5 + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(kafka_consumer_fetch_manager_records_lag{namespace=\"${namespace}\", drSinkInstance=\"\", job=\"${kafka_connect_src_job}\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " ", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Kafka Connect Source Lag", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 5 + }, + "hideTimeOverride": false, + "id": 8, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "max(kafka_consumergroup_group_max_lag{cluster_name=\"${kafka_instance}\", namespace=\"${namespace}\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " ", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Kafka Lag", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 5 + }, + "hideTimeOverride": false, + "id": 9, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(kafka_consumer_fetch_manager_records_lag{namespace=\"${namespace}\", drSinkInstance!=\"\", job=\"${kafka_connect_sink_job}\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " ", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Kafka Connect Sink Lag", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, + { + "collapsed": false, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "hideTimeOverride": false, + "id": 10, + "links": [], + "maxDataPoints": 100, + "panels": [], + "targets": [], + "title": "Processing rate", + "transformations": [], + "transparent": false, + "type": "row" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 14 + }, + "hideTimeOverride": false, + "id": 11, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(s3_lifecycle_duration_seconds_count{location=~\"${locations}\", job=~\"${lifecycle_jobs}\", namespace=\"${namespace}\", type=\"archive\"}[$__rate_interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " ", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Lifecycle Archive Rate (source)", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 14 + }, + "hideTimeOverride": false, + "id": 12, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(kafka_server_brokertopicmetrics_messagesin_total{job=\"${kafka_instance}\", namespace=\"${namespace}\", topic=\"\"}[$__rate_interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " ", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Kafka Message Rate", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 14 + }, + "hideTimeOverride": false, + "id": 13, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(kafka_connect_mongodb_sink_task_metrics_in_task_put{job=\"${kafka_connect_sink_job}\", namespace=\"${namespace}\"}[$__rate_interval]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": " ", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Metadata Write Rate (sink)", + "transformations": [], + "transparent": false, + "type": "timeseries" + } + ], + "refresh": "30s", + "rows": [], + "schemaVersion": 12, + "sharedCrosshair": false, + "style": "dark", + "tags": [ + "Backeat" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Zenko DR ${zenkoName}", + "uid": null, + "version": 1 +} diff --git a/monitoring/pra/dashboard.py b/monitoring/pra/dashboard.py new file mode 100644 index 0000000000..f3f16e9e77 --- /dev/null +++ b/monitoring/pra/dashboard.py @@ -0,0 +1,303 @@ +from grafanalib.core import ( + ConstantInput, + DataSourceInput, + RowPanel, + Threshold, +) + +from grafanalib import formatunits as UNITS +from scalgrafanalib import ( + layout, + metrics, + Dashboard, + Stat, + Target, + TimeSeries, +) + + +class Metrics: + UP = metrics.Metric( + "up", + "drSinkInstance", job="${job}", namespace="${namespace}", + ) + + KAFKA_CONSUMERGROUP_LAG = metrics.Metric( + "kafka_consumergroup_group_max_lag", + "group", cluster_name="${kafka_instance}", namespace="${namespace}", + ) + + KAFKA_CONNECT_SOURCE_TASK_RECORDS_LAG = metrics.Metric( + "kafka_consumer_fetch_manager_records_lag", + "drSinkInstance", "job", namespace="${namespace}", + ) + + LIFECYCLE_DURATION = metrics.BucketMetric( + "s3_lifecycle_duration_seconds", + "type", location=["${locations}"], job=["${lifecycle_jobs}"], namespace="${namespace}", + ) + + LIFECYCLE_LAST_TIMESTAMP = metrics.Metric( + "s3_lifecycle_last_timestamp_ms", + "type", location=["${locations}"], job=["${lifecycle_jobs}"], namespace="${namespace}", + ) + + KAFKA_MESSAGES_IN_TOTAL = metrics.CounterMetric( + "kafka_server_brokertopicmetrics_messagesin_total", + job="${kafka_instance}", namespace="${namespace}", topic="", # topic="" is the total + ) + + KAFKA_CONNECT_MONGODB_PUT_TOTAL = metrics.CounterMetric( + "kafka_connect_mongodb_sink_task_metrics_in_task_put", + "drSinkInstance", job="${kafka_connect_sink_job}", namespace="${namespace}", + ) + + MONGODB_OPLOG_START_TS, MONGODB_OPLOG_END_TS, MONGODB_LAST_WRITE_TS = [ + metrics.Metric( + name, + "drSinkInstance", job=["${mongo_jobs}"], namespace="${namespace}", + ) + for name in ( + "mongodb_oplog_stats_start", + "mongodb_oplog_stats_end", + "mongodb_ss_repl_lastWrite_lastWriteDate", + ) + ] + + +up = [ + Stat( + title=title, + dataSource="${DS_PROMETHEUS}", + reduceCalc="last", + minValue="0", + maxValue=replicas, + noValue="0", + targets=[ + Target( + expr='sum(' + metric + ')', + ), + ], + thresholdType="percentage", + thresholds=[ + Threshold("red", 0, 0.0), + Threshold("yellow", 1, 50.0), + Threshold("green", 2, 100.0), + ], + ) + for title, metric, replicas in [ + ["KafkaConnect Source", Metrics.UP('drSinkInstance=""', job="${kafka_connect_src_job}"), 1], + ["KafkaConnect Sink", Metrics.UP('drSinkInstance!=""', job="${kafka_connect_sink_job}"), 1], + ["Kafka", Metrics.UP('job="${kafka_instance}"'), 1], + ] +] + +lastLifecycle = Stat( + title="Last lifecycle op", + description="Time since the last eligible op on the primary site.", + dataSource="${DS_PROMETHEUS}", + format=UNITS.CLOCK_MSEC, + noValue="-", + reduceCalc="last", + targets=[ + Target( + expr="time() - " + Metrics.LIFECYCLE_LAST_TIMESTAMP(), + ) + ], + thresholds=[ + Threshold("#808080", 0, 0.0), + Threshold("green", 1, 0.0), + Threshold("super-light-yellow", 2, 1800.0), + Threshold("orange", 3, 3600.0), + Threshold("red", 4, 3700.0), + ], +) + +lastMongodbWrite = Stat( + title="Last DR mongo write", + description="Time since the last mongo DB write on DR site.", + dataSource="${DS_PROMETHEUS}", + format=UNITS.CLOCK_MSEC, + noValue="-", + reduceCalc="last", + targets=[ + Target( + expr="time() - " + Metrics.MONGODB_LAST_WRITE_TS('drSinkInstance!=""'), + ) + ], + thresholds=[ + Threshold("#808080", 0, 0.0), + Threshold("green", 1, 0.0), + Threshold("super-light-yellow", 2, 1800.0), + Threshold("orange", 3, 3600.0), + Threshold("red", 4, 3700.0), + ], +) + +kafka_lag = TimeSeries( + title="Kafka Lag", + dataSource="${DS_PROMETHEUS}", + fillOpacity=30, + legendDisplayMode="list", + legendCalcs=["max", "mean"], + unit=UNITS.SECONDS, + targets=[Target( + expr="max(" + Metrics.KAFKA_CONSUMERGROUP_LAG() + ")", + legendFormat=" ", + )], +) + + +kafka_connect_lag_source, kafka_connect_lag_sink = [ + TimeSeries( + title="Kafka Connect " + name + " Lag", + dataSource="${DS_PROMETHEUS}", + fillOpacity=30, + legendDisplayMode="list", + legendCalcs=["max", "mean"], + unit=UNITS.SECONDS, + targets=[Target( + expr="sum(" + metric + ")", + legendFormat=" ", + )], + ) + for name, metric in [ + ["Source", Metrics.KAFKA_CONNECT_SOURCE_TASK_RECORDS_LAG('drSinkInstance=""', job="${kafka_connect_src_job}")], + ["Sink", Metrics.KAFKA_CONNECT_SOURCE_TASK_RECORDS_LAG('drSinkInstance!=""', job="${kafka_connect_sink_job}")], + ] +] + + +lifecycle_archive_rate = TimeSeries( + title="Lifecycle Archive Rate (source)", + dataSource="${DS_PROMETHEUS}", + fillOpacity=30, + legendDisplayMode="list", + legendCalcs=["mean", "max"], + unit=UNITS.OPS_PER_SEC, + targets=[Target( + expr="sum(rate(" + Metrics.LIFECYCLE_DURATION.count(type="archive") + "))", + legendFormat=" ", + )], +) + + +incoming_message_rate = TimeSeries( + title="Kafka Message Rate", + dataSource="${DS_PROMETHEUS}", + fillOpacity=30, + legendDisplayMode="list", + legendCalcs=["max", "mean"], + unit=UNITS.OPS_PER_SEC, + targets=[Target( + expr="sum(rate(" + Metrics.KAFKA_MESSAGES_IN_TOTAL() + "))", + legendFormat=" ", + )], +) + + +outgoing_message_rate = TimeSeries( + title="Metadata Write Rate (sink)", + dataSource="${DS_PROMETHEUS}", + fillOpacity=30, + legendDisplayMode="list", + legendCalcs=["mean", "max"], + unit=UNITS.OPS_PER_SEC, + targets=[Target( + expr="sum(rate(" + Metrics.KAFKA_CONNECT_MONGODB_PUT_TOTAL() + "))", + legendFormat=" ", + )], +) + + +dashboard = ( + Dashboard( + title="Zenko DR ${zenkoName}", + editable=True, + refresh="30s", + tags=["Backeat"], + timezone="", + version=1, + inputs=[ + DataSourceInput( + name="DS_PROMETHEUS", + label="Prometheus", + description="Prometheus server that will be used for all panels in the dashboard.", + pluginId="prometheus", + pluginName="Prometheus", + ), + ConstantInput( + name="namespace", + label="namespace", + description="Namespace associated with the Zenko instance", + value="zenko", + ), + ConstantInput( + name="zenkoName", + label="zenko instance name", + description="Name of the ZenkoDR instance", + value="artesca-data", + ), + ConstantInput( + name="kafka_instance", + label="kafka instance", + description="Name of the kafka instance/job/cluster_name", + value="artesca-data-dr-base-queue", + ), + ConstantInput( + name="kafka_connect_src_job", + label="kafka connect source job", + description="Name of the kafka connect job", + value="artesca-data-dr-base-queue-connector-metrics", + ), + ConstantInput( + name="kafka_connect_sink_job", + label="kafka connect sink job", + description="Name of the kafka connect job", + value="artesca-data-dr-base-queue-connector-metrics", + ), + ConstantInput( + name="lifecycle_jobs", + label="Lifecycle jobs", + description="Promethes label expression for lifecycle jobs", + value="artesca-data-backbeat-lifecycle-.*-headless", + ), + ConstantInput( + name="mongo_jobs", + label="MongoDb jobs", + description="Promethes label expression for mongo jobs", + value="zenko/data-db-mongodb-sharded-shard.*", + ), + ConstantInput( + name="locations", + label="Locations", + description="Promethes label expression to filter PRA locations", + value="glacier", + ), + ConstantInput( + name="replicas", + label="Replicas", + description="Expected number of replicas", + value="1", + ), + ], + panels=layout.column([ + layout.row( + up + layout.resize([lastLifecycle, lastMongodbWrite], width=6,), + height=4, + ), + RowPanel(title="Kafka Lag"), + layout.row( + [kafka_connect_lag_source, kafka_lag, kafka_connect_lag_sink], + height=8, + ), + RowPanel(title="Processing rate"), + layout.row( + [lifecycle_archive_rate, incoming_message_rate, outgoing_message_rate], + height=8, + ), + ]), + ) + .auto_panel_ids() + .verify_datasources() +) \ No newline at end of file diff --git a/solution/zenkoversion.yaml b/solution/zenkoversion.yaml index 39f3799707..866918607f 100644 --- a/solution/zenkoversion.yaml +++ b/solution/zenkoversion.yaml @@ -39,6 +39,9 @@ spec: scuba: image: scuba-dashboards tag: '${SCUBA_TAG}' + dr: + image: dr-dashboards + tag: '${VERSION_FULL}' policies: backbeat: image: backbeat-policies