From 6f10c85b7fbb0c87180db8f7cdae6b16a9e5e80c Mon Sep 17 00:00:00 2001
From: Maha Benzekri
Date: Mon, 13 May 2024 19:45:10 +0200
Subject: [PATCH] S3UTILS-158: monitoring dashboards introduction (might need to be dropped until next PR)

---
Reviewer notes (this section is ignored by git am):
 * Removed the unbalanced trailing ")" from the bucketCountDuration PromQL
   expression, in both dashboard.py and the generated dashboard.json.
 * The blob ids on the "index" lines below predate that fix.
 * consolidationDuration is defined in dashboard.py but is not yet part of
   the dashboard panels, so it does not appear in dashboard.json.

 monitoring/dashboard.json   | 211 ++++++++++++++++++++++++++++++++++++
 monitoring/dashboard.py     | 101 +++++++++++++++++
 monitoring/requirements.txt |   2 +
 3 files changed, 314 insertions(+)
 create mode 100644 monitoring/dashboard.json
 create mode 100644 monitoring/dashboard.py
 create mode 100644 monitoring/requirements.txt

diff --git a/monitoring/dashboard.json b/monitoring/dashboard.json
new file mode 100644
index 00000000..6c3cfbb3
--- /dev/null
+++ b/monitoring/dashboard.json
@@ -0,0 +1,211 @@
+{
+  "__inputs": [
+    {
+      "description": "",
+      "label": "Prometheus",
+      "name": "DS_PROMETHEUS",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus",
+      "type": "datasource"
+    },
+    {
+      "description": "",
+      "label": "Loki",
+      "name": "DS_LOKI",
+      "pluginId": "loki",
+      "pluginName": "Loki",
+      "type": "datasource"
+    },
+    {
+      "description": "Namespace associated with the Zenko instance",
+      "label": "namespace",
+      "name": "namespace",
+      "type": "constant",
+      "value": "zenko"
+    },
+    {
+      "description": "Name of the S3utils job, used to filter the metrics.",
+      "label": "job",
+      "name": "job",
+      "type": "constant",
+      "value": "artesca-data-ops-count-items-metrics"
+    },
+    {
+      "description": "Prefix of the cronjob pod name, used to filter only the cronjob instances.",
+      "label": "pod",
+      "name": "pod",
+      "type": "constant",
+      "value": "artesca-data-ops-count-items"
+    }
+  ],
+  "annotations": {
+    "list": []
+  },
+  "description": "",
+  "editable": true,
+  "gnetId": null,
+  "hideControls": false,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "editable": true,
+      "error": false,
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        }
+      },
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "hideTimeOverride": false,
+      "id": 1,
+      "links": [],
+      "maxDataPoints": 100,
+      "panels": [],
+      "targets": [],
+      "title": "Processing Duration",
+      "transformations": [],
+      "transparent": false,
+      "type": "row"
+    },
+    {
+      "datasource": "${DS_PROMETHEUS}",
+      "editable": true,
+      "error": false,
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "smooth",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "log": 2,
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": 180000,
+            "stacking": {},
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 1
+      },
+      "hideTimeOverride": false,
+      "id": 2,
+      "links": [],
+      "maxDataPoints": 100,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "single"
+        }
+      },
+      "targets": [
+        {
+          "datasource": null,
+          "expr": "sum(rate(count_items_bucketProcessingDuration_count{namespace=\"${namespace}\", job=~\"${job}\"}[$__rate_interval]))",
+          "format": "time_series",
+          "hide": false,
+          "instant": false,
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{namespace}} - {{job}}",
+          "metric": "",
+          "refId": "",
+          "step": 10,
+          "target": ""
+        }
+      ],
+      "title": "bucket count duration",
+      "transformations": [],
+      "transparent": false,
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "30s",
+  "rows": [],
+  "schemaVersion": 12,
+  "sharedCrosshair": false,
+  "style": "dark",
+  "tags": [
+    "S3Utils"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "hidden": false,
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "1m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "",
+  "title": "S3Utils service",
+  "uid": null,
+  "version": 0
+}
diff --git a/monitoring/dashboard.py b/monitoring/dashboard.py
new file mode 100644
index 00000000..f479a9eb
--- /dev/null
+++ b/monitoring/dashboard.py
@@ -0,0 +1,101 @@
+from grafanalib.core import (
+    ConstantInput,
+    DataSourceInput,
+    Heatmap,
+    HeatmapColor,
+    HIDE_VARIABLE,
+    RowPanel,
+    Stat,
+    Template,
+    Templating,
+    Threshold,
+    YAxis,
+)
+from grafanalib import formatunits as UNITS
+from scalgrafanalib import (
+    layout,
+    BarGauge,
+    Dashboard,
+    GaugePanel,
+    PieChart,
+    Tooltip,
+    Target,
+    TimeSeries
+)
+
+bucketCountDuration = TimeSeries(
+    title="bucket count duration",
+    dataSource="${DS_PROMETHEUS}",
+    lineInterpolation="smooth",
+    spanNulls=3*60*1000,
+    unit=UNITS.SECONDS,
+    targets=[Target(
+        expr='sum(rate(count_items_bucketProcessingDuration_count{namespace="${namespace}", job=~"${job}"}[$__rate_interval]))',
+        legendFormat='{{namespace}} - {{job}}'
+    )],
+)
+
+consolidationDuration = TimeSeries(
+    title="consolidation duration",
+    dataSource="${DS_PROMETHEUS}",
+    lineInterpolation="smooth",
+    spanNulls=3*60*1000,
+    unit=UNITS.SECONDS,
+    targets=[Target(
+        expr='sum(rate(count_items_consolidationDuration_count{namespace="${namespace}", job=~"${job}"}[$__rate_interval]))',
+        legendFormat='{{namespace}} - {{job}}'
+    )],
+)
+
+
+dashboard = (
+    Dashboard(
+        title="S3Utils service",
+        editable=True,
+        refresh="30s",
+        tags=["S3Utils"],
+        timezone="",
+        inputs=[
+            DataSourceInput(
+                name="DS_PROMETHEUS",
+                label="Prometheus",
+                pluginId="prometheus",
+                pluginName="Prometheus",
+            ),
+            DataSourceInput(
+                name="DS_LOKI",
+                label="Loki",
+                pluginId="loki",
+                pluginName="Loki"
+            ),
+            ConstantInput(
+                name="namespace",
+                label="namespace",
+                description="Namespace associated with the Zenko instance",
+                value="zenko",
+            ),
+            ConstantInput(
+                name="job",
+                label="job",
+                description="Name of the S3utils job, used to filter the "
+                            "metrics.",
+                value="artesca-data-ops-count-items-metrics",
+            ),
+            ConstantInput(
+                name="pod",
+                label="pod",
+                description="Prefix of the cronjob pod name, used to filter "
+                            "only the cronjob instances.",
+                value="artesca-data-ops-count-items",
+            ),
+        ],
+        panels=layout.column([
+            RowPanel(title="Processing Duration"),
+            layout.row([bucketCountDuration], height=8),
+        ]),
+    )
+    .auto_panel_ids()
+    .verify_datasources()
+)
+
+
diff --git a/monitoring/requirements.txt b/monitoring/requirements.txt
new file mode 100644
index 00000000..f6925cee
--- /dev/null
+++ b/monitoring/requirements.txt
@@ -0,0 +1,2 @@
+attrs==21.4.0
+grafanalib==0.6.3
\ No newline at end of file