diff --git a/docs/source/usage/priorities.rst b/docs/source/usage/priorities.rst index dd3bcf4..a38e398 100644 --- a/docs/source/usage/priorities.rst +++ b/docs/source/usage/priorities.rst @@ -44,7 +44,7 @@ and now you can launch the request .. code-block:: console # launch a low priority task - $ sky launch -y -d -c low task.yaml + $ sky launch -y -d -c low low.yaml # list workloads in kueue diff --git a/grafana/konduktor_user_gke.json b/grafana/konduktor_user_gke.json index eb76904..3b98c7b 100644 --- a/grafana/konduktor_user_gke.json +++ b/grafana/konduktor_user_gke.json @@ -23,7 +23,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "11.3.1" + "version": "11.4.0" }, { "type": "panel", @@ -187,7 +187,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -293,7 +293,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -397,7 +397,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -501,7 +501,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -607,7 +607,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -713,7 +713,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -731,57 +731,228 @@ "title": "NVLINK Throughput", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 37, + "panels": [], + "title": "Node Stats", + "type": "row" + }, { "datasource": { - "type": "loki", - "uid": "${DS_LOKI}" + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] }, "gridPos": { - "h": 34, + "h": 8, "w": 24, "x": 0, - "y": 24 + "y": 25 }, - "id": 44, + "id": 33, "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": true, - "sortOrder": "Descending", - "wrapLogMessage": false + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { - "type": "loki", - "uid": "${DS_LOKI}" + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "{k8s_namespace_name=\"default\", k8s_pod_name=~\"${pod}\"} | logfmt", - "legendFormat": "", - "queryType": "range", + "expr": "sum by (exported_pod, node) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace='default', pod=~\"${pod}\"} * on(pod) group_left(exported_pod) label_replace(kube_pod_labels{namespace='default', pod=~\"${pod}\", label_kueue_x_k8s_io_pod_group_name=~\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")) * 100 / on(node) group_left kube_node_status_capacity{resource='cpu'}", + "interval": "", + "legendFormat": "{{exported_pod}} ON {{node}}", + "range": true, "refId": "A" } ], - "title": "pod logs", - "type": "logs" + "title": "CPU Utilization (By Pod)", + "type": "timeseries" }, { - "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, "gridPos": { - "h": 1, + "h": 8, "w": 24, "x": 0, - "y": 58 + "y": 33 }, - "id": 37, - "panels": [], - "title": "Node Stats", - "type": "row" + "id": 34, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (exported_pod, node) (container_memory_working_set_bytes{namespace='default', pod=~\"${pod}\"} * on(pod) group_left(exported_pod) label_replace(kube_pod_labels{namespace='default', pod=~\"${pod}\", label_kueue_x_k8s_io_pod_group_name=~\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")) * 100 / on(node) group_left kube_node_status_capacity{resource='memory'}", + "interval": "", + "legendFormat": "{{exported_pod}} ON {{node}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Memory (By Pod)", + "type": "timeseries" }, { "datasource": { @@ -802,7 +973,7 @@ "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", - "fillOpacity": 100, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -811,10 +982,111 @@ }, "insertNulls": false, "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 48, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "kube_pod_labels{namespace=\"default\", pod=~\"${pod}\", label_kueue_x_k8s_io_pod_group_name=~\"${workloads}\"} * on(pod) group_left(node, host_ip) kube_pod_info * on(host_ip) group_left(instance) label_replace(100 * (1 - (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint=\"/home\"} / node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint=\"/home\"})), \"host_ip\", \"$1\", \"instance\", \"(.*):.*\")", + "legendFormat": "{{pod}} ON {{node}}", + "range": true, + "refId": "A" + } + ], + "title": "Node Disk Usage (Whole Node)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -829,13 +1101,15 @@ "mode": "off" } }, + "links": [], "mappings": [], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -843,31 +1117,35 @@ } ] }, - "unit": "percentunit" + "unit": "percent" }, "overrides": [] }, "gridPos": { - "h": 12, - "w": 12, + "h": 8, + "w": 24, "x": 0, - "y": 59 + "y": 49 }, - "id": 33, + "id": 47, "options": { "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": false + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, "tooltip": { "maxHeight": 600, "mode": "multi", - "sort": "desc" + "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -875,15 +1153,13 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum by (pod) (node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace='default', pod=~\"${pod}\"})", - "hide": false, - "instant": false, - "legendFormat": "__auto", + "expr": "kube_pod_labels{namespace='default', pod=~\"${pod}\", label_kueue_x_k8s_io_pod_group_name=~\"${workloads}\"} * on(pod) group_left(node, host_ip) kube_pod_info * on(host_ip) group_left(instance) label_replace(100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes), \"host_ip\", \"$1\", \"instance\", \"(.*):.*\")", + "legendFormat": "{{pod}} ON {{node}}", "range": true, "refId": "A" } ], - "title": "CPU Utilisation", + "title": "Node RAM Usage (Whole Node)", "type": "timeseries" }, { @@ -909,8 +1185,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -987,7 +1262,7 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 57 }, "id": 30, "options": { @@ -1009,7 +1284,7 @@ } ] }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -1080,208 +1355,94 @@ }, { "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] + "type": "loki", + "uid": "${DS_LOKI}" }, "gridPos": { - "h": 4, - "w": 2, - "x": 12, - "y": 67 + "h": 34, + "w": 24, + "x": 0, + "y": 65 }, - "id": 26, + "id": 44, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.3.1", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "count(DCGM_FI_DEV_GPU_UTIL\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")) by (modelName)", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "Total GPUs (Instant)", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "decimals": 0, - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.8 - }, - { - "color": "red", - "value": 0.9 - } - ] - }, - "unit": "sishort" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "instance" - }, - "properties": [ - { - "id": "custom.width", - "value": 156 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Disk Total" - }, - "properties": [ - { - "id": "custom.width", - "value": 139 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Disk Available" - }, - "properties": [ - { - "id": "custom.width", - "value": 153 - } - ] + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${DS_LOKI}" }, - { - "matcher": { - "id": "byName", - "options": "Disk Used" - }, - "properties": [ - { - "id": "custom.width", - "value": 133 - } - ] + "editorMode": "code", + "expr": "{k8s_namespace_name=\"default\", k8s_pod_name=~\"${pod}\"} | logfmt", + "legendFormat": "", + "queryType": "range", + "refId": "A" + } + ], + "title": "pod logs", + "type": "logs" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, - { - "matcher": { - "id": "byName", - "options": "Usage %" - }, - "properties": [ + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "unit", - "value": "percentunit" + "color": "green" }, { - "id": "custom.cellOptions", - "value": { - "mode": "gradient", - "type": "gauge", - "valueDisplayMode": "color" - } + "color": "red", + "value": 80 } ] } - ] + }, + "overrides": [] }, "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 71 + "h": 4, + "w": 2, + "x": 12, + "y": 99 }, - "id": 38, + "id": 26, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "Usage %" - } - ] + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -1289,106 +1450,16 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "max by (instance) (node_filesystem_size_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\"})\n", - "format": "table", + "exemplar": false, + "expr": "count(DCGM_FI_DEV_GPU_UTIL\n* on (exported_pod) group_left(kube_pod_labels)\nlabel_replace(kube_pod_labels{namespace='default', label_kueue_x_k8s_io_pod_group_name=\"${workloads}\"}, \"exported_pod\", \"$1\", \"pod\", \"(.+)\")) by (modelName)", "instant": true, - "intervalFactor": 2, - "legendFormat": "", + "legendFormat": "__auto", + "range": false, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "max by (instance) (node_filesystem_avail_bytes{job=\"node-exporter\", fstype!=\"\", mountpoint!=\"\"})\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B" - } - ], - "title": "Disk Space Usage", - "transformations": [ - { - "disabled": true, - "id": "groupBy", - "options": { - "fields": { - "Value #A": { - "aggregations": [ - "lastNotNull" - ], - "operation": "aggregate" - }, - "Value #B": { - "aggregations": [ - "lastNotNull" - ], - "operation": "aggregate" - }, - "mountpoint": { - "aggregations": [], - "operation": "groupby" - } - } - } - }, - { - "id": "merge", - "options": {} - }, - { - "id": "calculateField", - "options": { - "alias": "Used", - "binary": { - "left": "Value #A", - "operator": "-", - "right": "Value #B" - }, - "mode": "binary", - "reduce": { - "reducer": "sum" - } - } - }, - { - "id": "calculateField", - "options": { - "alias": "Used %", - "binary": { - "left": "Used", - "operator": "/", - "right": "Value #A" - }, - "mode": "binary", - "reduce": { - "reducer": "sum" - } - } - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "includeByName": {}, - "indexByName": {}, - "renameByName": { - "Time": "", - "Used": "Disk Used", - "Used %": "Usage %", - "Value #A": "Disk Total", - "Value #B": "Disk Available" - } - } } ], - "type": "table" + "title": "Total GPUs (Instant)", + "type": "stat" }, { "datasource": { @@ -1439,8 +1510,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1456,7 +1526,7 @@ "h": 8, "w": 12, "x": 12, - "y": 78 + "y": 103 }, "id": 2, "options": { @@ -1476,7 +1546,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.1", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -1497,143 +1567,148 @@ "type": "timeseries" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 86 + "y": 111 }, "id": 43, - "panels": [], - "title": "Error Logs", - "type": "row" - }, - { - "datasource": { - "type": "loki", - "uid": "${DS_LOKI}" - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 87 - }, - "id": 45, - "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, - "showLabels": false, - "showTime": true, - "sortOrder": "Descending", - "wrapLogMessage": false - }, - "pluginVersion": "11.3.1", - "targets": [ + "panels": [ { "datasource": { "type": "loki", "uid": "${DS_LOKI}" }, - "editorMode": "code", - "expr": "{k8s_daemonset_name=\"dmesg\"} |~ `(?i)NVRM: xid` or `(?i)error`", - "queryType": "range", - "refId": "A" + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 206 + }, + "id": 45, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${DS_LOKI}" + }, + "editorMode": "code", + "expr": "{k8s_daemonset_name=\"dmesg\"} |~ `(?i)NVRM: xid` or `(?i)error`", + "queryType": "range", + "refId": "A" + } + ], + "title": "dmesg errors", + "type": "logs" } ], - "title": "dmesg errors", - "type": "logs" + "title": "Error Logs", + "type": "row" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 95 + "y": 112 }, "id": 41, - "panels": [], - "title": "XID Errors", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [], - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 96 - }, - "id": 42, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.3.1", - "targets": [ + "panels": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "code", - "exemplar": false, - "expr": "DCGM_FI_DEV_XID_ERRORS{instance=~\"${instance}:9400\", gpu=~\"${gpu}\"}", - "instant": true, - "interval": "", - "legendFormat": "GPU {{gpu}} instance {{instance}}", - "range": false, - "refId": "A" + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 215 + }, + "id": 42, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "DCGM_FI_DEV_XID_ERRORS{instance=~\"${instance}:9400\", gpu=~\"${gpu}\"}", + "instant": true, + "interval": "", + "legendFormat": "GPU {{gpu}} instance {{instance}}", + "range": false, + "refId": "A" + } + ], + "title": "XID Errors", + "type": "stat" } ], "title": "XID Errors", - "type": "stat" + "type": "row" } ], "refresh": "1m", @@ -1681,9 +1756,7 @@ { "current": {}, "definition": "label_values(kube_pod_labels{namespace=\"default\", label_kueue_x_k8s_io_pod_group_name=~\"$workloads\"},pod)", - "hide": 1, "includeAll": true, - "label": "pod", "multi": true, "name": "pod", "options": [], @@ -1699,13 +1772,13 @@ ] }, "time": { - "from": "2025-01-07T02:00:00.000Z", - "to": "2025-01-07T07:59:59.000Z" + "from": "now-5m", + "to": "now" }, "timepicker": {}, "timezone": "", "title": "Konduktor - user (GKE)", "uid": "Oxed_c6W5", - "version": 49, + "version": 36, "weekStart": "" } \ No newline at end of file diff --git a/manifests/kube-prometheus-stack.values b/manifests/kube-prometheus-stack.values index eff398f..8e322f4 100644 --- a/manifests/kube-prometheus-stack.values +++ b/manifests/kube-prometheus-stack.values @@ -1001,6 +1001,14 @@ grafana: value: Prometheus - name: DS_LOKI value: loki + konduktor-user-gke-dashboard: + url: https://raw.githubusercontent.com/Trainy-ai/konduktor/main/grafana/konduktor_user_gke.json + datasource: + - name: DS_PROMETHEUS + value: Prometheus + - name: DS_LOKI + value: loki + dashboardProviders: dashboardproviders.yaml: @@ -3805,6 +3813,7 @@ prometheus: storageClassName: standard-rwo # storageClassName: local-path # for k3s # storageClassName: gp2 # for aws + # storageClassName: compute-csi-default-sc # for nebius accessModes: ["ReadWriteOnce"] resources: requests: diff --git a/start_dashboard.sh b/start_dashboard.sh old mode 100644 new mode 100755 index 064f8e9..760341e --- a/start_dashboard.sh +++ b/start_dashboard.sh @@ -29,7 +29,7 @@ else echo "Waiting for Grafana deployment to be available..." kubectl wait --for=condition=available deployment/kube-prometheus-stack-grafana -n prometheus --timeout=120s - # Wait 20 seconds to ensure pods are up and ready + # Wait 10 seconds to ensure pods are up and ready echo "Waiting for pods to finish setup..." sleep 10 fi