From c4f2cb7947a90d16b38d6ad52492bcbe33799a18 Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 7 Jan 2025 14:10:21 -0800 Subject: [PATCH] user dash on gke --- grafana/konduktor_user_gke.json | 1705 +++++++++++++++++++++++++++++++ 1 file changed, 1705 insertions(+) create mode 100644 grafana/konduktor_user_gke.json diff --git a/grafana/konduktor_user_gke.json b/grafana/konduktor_user_gke.json new file mode 100644 index 0000000..49df10b --- /dev/null +++ b/grafana/konduktor_user_gke.json @@ -0,0 +1,1705 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "DS_LOKI", + "label": "loki", + "description": "", + "type": "datasource", + "pluginId": "loki", + "pluginName": "Loki" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.3.1" + }, + { + "type": "panel", + "id": "logs", + "name": "Logs", + "version": "" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:192", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard is to display running jobs and DCGM metrics from a Kubernetes cluster.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 19, + "panels": [], + "title": "GPU Stats Done", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_GPU_UTIL{gpu=~\"${gpu}\", exported_pod=~\"${pod}\"}\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")", + "interval": "", + "legendFormat": "GPU {{gpu}} {{exported_pod}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "DCGM_FI_PROF_SM_ACTIVE{gpu=~\"${gpu}\", exported_pod=~\"${pod}\"} * 100\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")", + "interval": "", + "legendFormat": "GPU {{gpu}} {{exported_pod}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU SM Efficiency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_GPU_TEMP{gpu=~\"${gpu}\", exported_pod=~\"${pod}\"}\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")", + "instant": false, + "interval": "", + "legendFormat": "GPU {{gpu}} {{exported_pod}}", + "refId": "A" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_POWER_USAGE{gpu=~\"${gpu}\", exported_pod=~\"${pod}\"}\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")", + "interval": "", + "legendFormat": "GPU {{gpu}} {{exported_pod}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu=~\"${gpu}\", exported_pod=~\"${pod}\"}\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")", + "interval": "", + "legendFormat": "GPU {{gpu}} {{exported_pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Tensor Core Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 39, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu=~\"${gpu}\", exported_pod=~\"${pod}\"} * 1e6\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")", + "interval": "", + "legendFormat": "GPU {{gpu}} {{exported_pod}}", + "range": true, + "refId": "A" + } + ], + "title": "NVLINK Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "${DS_LOKI}" + }, + "gridPos": { + "h": 34, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 44, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${DS_LOKI}" + }, + "editorMode": "code", + "expr": "{k8s_namespace_name=\"default\", k8s_pod_name=~\"${pod}\"} | logfmt", + "legendFormat": "", + "queryType": "range", + "refId": "A" + } + ], + "title": "pod logs", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 58 + }, + "id": 37, + "panels": [], + "title": "Node Stats", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (pod) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace='default', pod=~\"${pod}\"})", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU Utilisation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "namespace" + }, + "properties": [ + { + "id": "custom.width", + "value": 176 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "label_kueue_x_k8s_io_pod_group_name" + }, + "properties": [ + { + "id": "custom.width", + "value": 406 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "workload" + }, + "properties": [ + { + "id": "custom.width", + "value": 112 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "# gpus" + }, + "properties": [ + { + "id": "custom.width", + "value": 101 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "# gpus" + }, + "properties": [ + { + "id": "unit", + "value": "none" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 30, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "# gpus" + } + ] + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max by (label_kueue_x_k8s_io_pod_group_name, namespace) ((sum_over_time(kube_pod_status_phase{phase=\"Running\"}[7d]) * kube_pod_status_phase{phase=\"Running\"}\n + on(namespace, pod) group_left(label_kueue_x_k8s_io_pod_group_name)\n kube_pod_labels{label_kueue_x_k8s_io_pod_group_name=~\"(.+)\"} * 0\n) > 0) * 30", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (label_kueue_x_k8s_io_pod_group_name, namespace) ((\n kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"} \n * on(namespace, pod) group_left()\n kube_pod_status_phase{phase=\"Running\"}\n + on(namespace, pod) group_left(label_kueue_x_k8s_io_pod_group_name)\n kube_pod_labels{label_kueue_x_k8s_io_pod_group_name=~\"(.+)\"} * 0\n)) > 0", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Active workloads (Kueue)", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "label_kueue_x_k8s_io_pod_group_name", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "namespace 2": true + }, + "includeByName": {}, + "indexByName": { + "Time 1": 1, + "Time 2": 3, + "Value #A": 6, + "Value #B": 5, + "label_kueue_x_k8s_io_pod_group_name": 0, + "namespace 1": 2, + "namespace 2": 4 + }, + "renameByName": { + "Value #A": "Time Active", + "Value #B": "# gpus", + "label_kueue_x_k8s_io_pod_group_name": "workload" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 67 + }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(DCGM_FI_DEV_GPU_UTIL\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")) by (modelName)", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total GPUs (Instant)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "decimals": 0, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 0.8 + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "sishort" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.width", + "value": 156 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disk Total" + }, + "properties": [ + { + "id": "custom.width", + "value": 139 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disk Available" + }, + "properties": [ + { + "id": "custom.width", + "value": 153 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disk Used" + }, + "properties": [ + { + "id": "custom.width", + "value": 133 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Usage %" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge", + "valueDisplayMode": "color" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 71 + }, + "id": 38, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Usage %" + } + ] + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max by (instance) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\"})\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max by (instance) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\"})\n", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B" + } + ], + "title": "Disk Space Usage", + "transformations": [ + { + "disabled": true, + "id": "groupBy", + "options": { + "fields": { + "Value #A": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "Value #B": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, + "mountpoint": { + "aggregations": [], + "operation": "groupby" + } + } + } + }, + { + "id": "merge", + "options": {} + }, + { + "id": "calculateField", + "options": { + "alias": "Used", + "binary": { + "left": "Value #A", + "operator": "-", + "right": "Value #B" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } + }, + { + "id": "calculateField", + "options": { + "alias": "Used %", + "binary": { + "left": "Used", + "operator": "/", + "right": "Value #A" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Time": "", + "Used": "Disk Used", + "Used %": "Usage %", + "Value #A": "Disk Total", + "Value #B": "Disk Available" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m])\n)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "instance {{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Network received bytes total", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 86 + }, + "id": 43, + "panels": [], + "title": "Error Logs", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "${DS_LOKI}" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 87 + }, + "id": 45, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${DS_LOKI}" + }, + "editorMode": "code", + "expr": "{k8s_daemonset_name=\"dmesg\"} |~ `(?i)NVRM: xid` or `(?i)error`", + "queryType": "range", + "refId": "A" + } + ], + "title": "dmesg errors", + "type": "logs" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 95 + }, + "id": 41, + "panels": [], + "title": "XID Errors", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 96 + }, + "id": 42, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "DCGM_FI_DEV_XID_ERRORS{instance=~\"${instance}:9400\", gpu=~\"${gpu}\"}", + "instant": true, + "interval": "", + "legendFormat": "GPU {{gpu}} instance {{instance}}", + "range": false, + "refId": "A" + } + ], + "title": "XID Errors", + "type": "stat" + } + ], + "refresh": "", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(kube_pod_labels,label_kueue_x_k8s_io_pod_group_name)", + "includeAll": true, + "multi": true, + "name": "workloads", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_pod_labels,label_kueue_x_k8s_io_pod_group_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "definition": "label_values(gpu)", + "description": "", + "includeAll": true, + "multi": true, + "name": "gpu", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(gpu)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {}, + "definition": "label_values(kube_pod_labels{namespace=\"default\", label_kueue_x_k8s_io_pod_group_name=\"$workloads\"},pod)", + "hide": 1, + "includeAll": true, + "label": "pod", + "multi": true, + "name": "pod", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_pod_labels{namespace=\"default\", label_kueue_x_k8s_io_pod_group_name=\"$workloads\"},pod)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Konduktor - user (GKE)", + "uid": "Oxed_c6W5", + "version": 39, + "weekStart": "" +} \ No newline at end of file