migrate: misc migrations across all dashboards
consideRatio committed Apr 7, 2024
1 parent 9f1c40a commit 1277787
Showing 7 changed files with 88 additions and 103 deletions.
4 changes: 2 additions & 2 deletions dashboards/cluster.jsonnet
@@ -431,7 +431,7 @@ dashboard.new('Cluster Information')
[
row.new('Cluster Utilization')
+ row.withPanels([
userPods,
userPods, // FIXME: previously width 24
userNodes,
nodepoolMemoryCommitment,
nodepoolCPUCommitment,
@@ -450,6 +450,6 @@ dashboard.new('Cluster Information')
]),
],
panelWidth=12,
panelHeight=8,
panelHeight=10,
)
)
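
For context on the panelWidth/panelHeight change above: grafonnet's util.grid.makeGrid gives every panel in a row the same size, which is presumably why userPods can no longer span the full width of 24 noted in the FIXME. A minimal sketch of the layout call, assuming a vendored grafonnet v10 import path (the path is a guess, not something this diff shows):

// Sketch only, not part of this commit. The import path below is an assumed
// grafonnet v10 location.
local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet';
local dashboard = grafonnet.dashboard;
local row = grafonnet.panel.row;

dashboard.new('Layout example')
+ dashboard.withPanels(
  grafonnet.util.grid.makeGrid(
    [
      row.new('Example row')
      + row.withPanels([]),  // every panel in this list gets the same size
    ],
    panelWidth=12,   // the grid is 24 units wide, so two panels per visual row
    panelHeight=10,  // the value this commit bumps from 8
  )
)
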
141 changes: 62 additions & 79 deletions dashboards/jupyterhub.jsonnet
@@ -22,8 +22,7 @@ local currentActiveUsers =
|||
)
+ ts.standardOptions.withDecimals(0)
// stack=true,
+ ts.standardOptions.withMin(0)
// FIXME: not migrated config stack=true,
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -50,10 +49,9 @@ local dailyActiveUsers =
Requires JupyterHub 3.1.
|||,
)
// legend_hideZero=false,
// FIXME: not migrated config legend_hideZero=false,
+ ts.standardOptions.withDecimals(0)
// stack=true,
+ ts.standardOptions.withMin(0)
// FIXME: not migrated config stack=true,
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -76,10 +74,9 @@ local weeklyActiveUsers =
Requires JupyterHub 3.1.
|||
)
// legend_hideZero=false,
// FIXME: not migrated config legend_hideZero=false,
+ ts.standardOptions.withDecimals(0)
// stack=true,
+ ts.standardOptions.withMin(0)
// FIXME: not migrated config stack=true,
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -102,10 +99,9 @@ local monthlyActiveUsers =
Requires JupyterHub 3.1.
|||
)
// legend_hideZero=false,
// FIXME: not migrated config legend_hideZero=false,
+ ts.standardOptions.withDecimals(0)
// stack=true,
+ ts.standardOptions.withMin(0)
// FIXME: not migrated config stack=true,
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -121,11 +117,11 @@ local monthlyActiveUsers =
local userMemoryDistribution =
common.heatmapOptions
+ heatmap.new('User memory usage distribution')
// xBucketSize and interval must match to get correct values out of heatmaps
// xBucketSize='600s',
// yAxis_format='bytes',
// yAxis_min=0,
// color_colorScheme='interpolateViridis',
+ heatmap.standardOptions.withUnit('bytes')
+ heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis')
+ heatmap.options.calculation.xBuckets.withMode('size')
+ heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval
+ heatmap.queryOptions.withInterval('600s') // must align with xBuckets value
+ heatmap.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -139,19 +135,18 @@ local userMemoryDistribution =
) by (pod)
|||
% jupyterhub.onComponentLabel('singleuser-server', group_left='container'),
),
// interval='600s',
// intervalFactor=1,
)
+ prometheus.withIntervalFactor(1),
]);
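
A note on the paired '600s' values above: the heatmap's x-axis bucket size and the query interval must agree, or samples land in the wrong buckets and the distribution skews. One way to keep the two knobs from drifting apart, sketched with the same locals this file already uses (the refactor itself is not part of the commit):

// Hypothetical refactor: a single source of truth for both values.
local bucketInterval = '600s';

common.heatmapOptions
+ heatmap.new('User memory usage distribution')
+ heatmap.options.calculation.xBuckets.withMode('size')
+ heatmap.options.calculation.xBuckets.withValue(bucketInterval)
+ heatmap.queryOptions.withInterval(bucketInterval)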

local userCPUDistribution =
common.heatmapOptions
+ heatmap.new('User CPU usage distribution')
// xBucketSize and interval must match to get correct values out of heatmaps
// xBucketSize='600s',
// yAxis_format='percentunit',
// yAxis_min=0,
// color_colorScheme='interpolateViridis',
+ heatmap.standardOptions.withUnit('percentunit')
+ heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis')
+ heatmap.options.calculation.xBuckets.withMode('size')
+ heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval
+ heatmap.queryOptions.withInterval('600s') // must align with xBuckets value
+ heatmap.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -165,19 +160,18 @@ local userCPUDistribution =
) by (pod)
|||
% jupyterhub.onComponentLabel('singleuser-server', group_left='container'),
),
// interval='600s',
// intervalFactor=1,
)
+ prometheus.withIntervalFactor(1),
]);

local userAgeDistribution =
common.heatmapOptions
+ heatmap.new('User active age distribution')
// xBucketSize and interval must match to get correct values out of heatmaps
// xBucketSize='600s',
// yAxis_format='s',
// yAxis_min=0,
// color_colorScheme='interpolateViridis',
+ heatmap.standardOptions.withUnit('s')
+ heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis')
+ heatmap.options.calculation.xBuckets.withMode('size')
+ heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval
+ heatmap.queryOptions.withInterval('600s') // must align with xBuckets value
+ heatmap.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -191,17 +185,15 @@ local userAgeDistribution =
)
|||
% jupyterhub.onComponentLabel('singleuser-server'),
),
// interval='600s',
// intervalFactor=1,
)
+ prometheus.withIntervalFactor(1),
]);

// Hub diagnostics
local hubResponseLatency =
common.tsOptions
+ ts.new('Hub response latency')
// formatY1='s',
+ ts.standardOptions.withMin(0)
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -265,7 +257,6 @@ local hubResponseLatency =
local hubResponseCodes =
common.tsOptions
+ ts.new('Hub response status codes')
+ ts.standardOptions.withMin(0)
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -297,7 +288,6 @@ local hubDBUsage =
|||
)
+ ts.standardOptions.withDecimals(0)
+ ts.standardOptions.withMin(0)
+ ts.standardOptions.withMax(1)
// formatY1='percentunit',
+ ts.queryOptions.withTargets([
@@ -319,7 +309,6 @@ local serverStartTimes =
+ ts.new('Server Start Times')
// formatY1='s',
// lines=false,
+ ts.standardOptions.withMin(0)
// points=true,
// pointradius=2,
+ ts.queryOptions.withTargets([
@@ -345,9 +334,8 @@ local serverSpawnFailures =
|||
)
// lines=false,
+ ts.standardOptions.withMin(0)
// points=false,
// legend_hideZero=true,
// FIXME: not migrated config legend_hideZero=true,
// bars=true,
// pointradius=2,
+ ts.queryOptions.withTargets([
@@ -364,7 +352,6 @@ local usersPerNode =
common.tsOptions
+ ts.new('Users per node')
+ ts.standardOptions.withDecimals(0)
+ ts.standardOptions.withMin(0)
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -393,8 +380,7 @@ local nonRunningPods =
|||
)
// decimalsY1=0,
+ ts.standardOptions.withMin(0)
// stack=true,
// FIXME: not migrated config stack=true,
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -421,7 +407,6 @@ local sharedVolumeFreeSpace =
|||
)
// decimalsY1=0,
+ ts.standardOptions.withMin(0)
+ ts.standardOptions.withMax(1)
// formatY1='percentunit',
+ ts.queryOptions.withTargets([
@@ -442,14 +427,13 @@ local sharedVolumeFreeSpace =
local oldUserpods =
common.tableOptions
+ table.new('Very old user pods')
+ ts.panelOptions.withDescription(
+ table.panelOptions.withDescription(
|||
User pods that have been running for a long time (>8h).
This often indicates problems with the idle culler
|||
)
// transform='timeseries_to_rows',
// styles=[
// {
// pattern: 'Value',
@@ -458,11 +442,12 @@ local oldUserpods =
// alias: 'Age',
// },
// ],
// sort={
// col: 2,
// desc: true,
// },
+ ts.queryOptions.withTargets([
+ table.options.withSortBy({
col: 2,
desc: true,
})
+ table.queryOptions.withTransformations('timeseries_to_rows')
+ table.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
|||
@@ -475,20 +460,19 @@ local oldUserpods =
+ prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'),
// instant=true
]);
// .hideColumn('Time')
// FIXME: not migrated config .hideColumn('Time')
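
The .hideColumn('Time') FIXME above (and its twins below) has no one-line counterpart in the new API; in current Grafana the usual route is an 'organize' transformation. A hedged sketch (the transformation id and its options are stock Grafana, but this exact call is not in the commit):

+ table.queryOptions.withTransformations([
  {
    // 'organize' is a built-in Grafana transformation; excludeByName hides
    // the listed columns from the rendered table.
    id: 'organize',
    options: { excludeByName: { Time: true } },
  },
])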

local highCPUUserPods =
common.tableOptions
+ table.new('User Pods with high CPU usage (>0.5)')
+ ts.panelOptions.withDescription(
+ table.panelOptions.withDescription(
|||
User pods using a lot of CPU
This could indicate a runaway process consuming resources
unnecessarily.
|||
)
// transform='timeseries_to_rows',
// styles=[
// {
// pattern: 'Value',
@@ -497,11 +481,12 @@ local highCPUUserPods =
// alias: 'CPU usage',
// },
// ],
// sort={
// col: 2,
// desc: true,
// },
+ ts.queryOptions.withTargets([
+ table.options.withSortBy({
col: 2,
desc: true,
})
+ table.queryOptions.withTransformations('timeseries_to_rows')
+ table.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
|||
@@ -515,19 +500,18 @@ local highCPUUserPods =
+ prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'),
// instant=true
]);
// .hideColumn('Time')
// FIXME: not migrated config .hideColumn('Time')

local highMemoryUsagePods =
common.tableOptions
+ table.new('User pods with high memory usage (>80% of limit)')
+ ts.panelOptions.withDescription(
+ table.panelOptions.withDescription(
|||
User pods getting close to their memory limit
Once they hit their memory limit, user kernels will start dying.
|||
)
// transform='timeseries_to_rows',
// styles=[
// {
// pattern: 'Value',
@@ -536,11 +520,12 @@ local highMemoryUsagePods =
// alias: '% of mem limit consumed',
// },
// ],
// sort={
// col: 2,
// desc: true,
// },
+ ts.queryOptions.withTargets([
+ table.options.withSortBy({
col: 2,
desc: true,
})
+ table.queryOptions.withTransformations('timeseries_to_rows')
+ table.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
|||
@@ -562,7 +547,7 @@ local highMemoryUsagePods =
+ prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'),
// instant=true
]);
// .hideColumn('Time')
// FIXME: not migrated config .hideColumn('Time')

// Show images used by different users on the hub
local notebookImagesUsed =
@@ -573,10 +558,9 @@ local notebookImagesUsed =
Number of user servers using a container image.
|||
)
// legend_hideZero=false,
// FIXME: not migrated config legend_hideZero=false,
+ ts.standardOptions.withDecimals(0)
// stack=false,
+ ts.standardOptions.withMin(0)
// FIXME: not migrated config stack=false,
+ ts.queryOptions.withTargets([
prometheus.new(
'$PROMETHEUS_DS',
@@ -624,22 +608,21 @@ dashboard.new('JupyterHub Dashboard')
serverSpawnFailures,
hubResponseLatency,
hubResponseCodes,
allComponentsCPU, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 },
allComponentsMemory, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 },
allComponentsCPU, // FIXME: previous height 12
allComponentsMemory, // FIXME: previous height 12
hubDBUsage,
nonRunningPods,
usersPerNode,
sharedVolumeFreeSpace,
]),
row.new('Anomalous user pods')
+ row.withPanels([
oldUserpods, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 },
highCPUUserPods, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 },
highMemoryUsagePods, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 },
oldUserpods, // FIXME: previous height 12
highCPUUserPods, // FIXME: previous height 12
highMemoryUsagePods, // FIXME: previous height 12
]),
],
// FIXME: panelWidth and panelHeight specified like cluster.jsonnet without visual check
panelWidth=12,
panelHeight=8,
panelHeight=10,
)
)
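
Several hunks in this file park stack=true and legend_hideZero behind FIXME comments rather than migrating them. For the stacking half, grafonnet's timeSeries panel exposes a field-config builder that should correspond; a possible follow-up, sketched under that assumption rather than taken from this commit:

// Assumed grafonnet path for stacked time series; verify before adopting.
+ ts.fieldConfig.defaults.custom.stacking.withMode('normal')

legend_hideZero has no equally direct counterpart, which is presumably why those FIXMEs stay open.
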
1 change: 1 addition & 0 deletions dashboards/jupyterhub.libsonnet
@@ -80,6 +80,7 @@ local prometheus = grafonnet.query.prometheus;
*/
componentResourcePanel(title, metric, component='', formatY1=null, decimalsY1=null, multi=false)::
ts.new(title)
// FIXME: not migrated config below commented out
//decimalsY1=decimalsY1,
//formatY1=formatY1,
// show legend as a table with current, avg, max values
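
The legacy legend config commented out here (a table legend showing current, avg, and max) maps fairly directly onto the new legend options. A sketch assuming ts is grafonnet's timeSeries panel, not code from this diff:

// Hypothetical migration of the commented-out legend settings.
+ ts.options.legend.withDisplayMode('table')
+ ts.options.legend.withCalcs(['lastNotNull', 'mean', 'max'])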
(Diffs for the remaining 4 changed files did not load and are not shown.)
