forked from OleHolmNielsen/Slurm_tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshowpartitions
executable file
·307 lines (286 loc) · 9.87 KB
/
showpartitions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/env bash
# Show the Slurm partitions statistics
# Author: Ole H. Nielsen, Technical University of Denmark
# E-mail: [email protected]
# Home page: https://github.com/OleHolmNielsen/Slurm_tools
# Command usage:
#######################################
# Print the command line help text.
# Globals:   none
# Arguments: none
# Outputs:   writes the usage text and the column notes to stdout
#            ($0 is expanded to the name the script was invoked with)
#######################################
usage() {
  cat <<USAGE_TEXT
Usage: $0 [-p partition-list] [-g] [-m] [-a] [-f] [-h]
where:
-p partition-list: Print only jobs in partition(s) <partition-list>
-g: Print also GRES information
-m: Print minimum and maximum values for memory and cores/node.
-a: Display information about all partitions including hidden ones.
-f: Show all partitions from the federation if a member of one. Only Slurm 18.08 and newer.
-h: Print this help information
Notes about the columns:
1. An * after the partition name identifies the default Slurm partition.
2. An @ after the partition state means that some nodes are pending a reboot.
3. An $ after the partition state means that some nodes are in maintenance mode.
4. An R after the partition name identifies a root-only Slurm partition.
5. An H after the partition name identifies a hidden Slurm partition.
USAGE_TEXT
}
# Default settings and command line parsing.
# All settings are exported because the awk report program reads them
# through ENVIRON[] and the sinfo command line expands them.

# sinfo output options:
# PARTITION AVAIL NODES CPUS(A/I/O/T) MEMORY TIMELIMIT DEFAULTTIME JOB_SIZE NODES STATE GRES ROOT
export sinfo_options="%P %a %D %C %m %l %L %s %T %G %r"
# Printing of GRES is disabled by default
export printgres=0
# Printing of federation clusters is disabled by default
export federation=""
# By default sinfo does not show hidden and unavailable partitions
export all_partitions=""
# By default all partitions are selected.  Initialize explicitly so that a
# "partitionlist" variable inherited from the caller environment is never
# passed on to sinfo and squeue by accident.
export partitionlist=""
# Print both min and max values (disabled by default)
export minmax=0
# Enable colors in output
export colors=1

while getopts "p:gmafh" options; do
  case $options in
    p )
      export partitionlist="-p $OPTARG"
      echo "Print only jobs in partition $OPTARG"
      ;;
    g )
      export printgres=1
      ;;
    m )
      export minmax=1
      ;;
    a )
      export all_partitions="--all"
      ;;
    f )
      export federation="--federation"
      # Append sinfo option %V (CLUSTER) for Slurm 18.08 and newer.
      export sinfo_options="$sinfo_options %V"
      ;;
    h )
      # Help was requested explicitly: print to stdout and report success
      usage
      exit 0
      ;;
    * )
      # Invalid option or missing option argument (getopts sets "?")
      usage >&2
      exit 1
      ;;
  esac
done

# Test for extraneous command line arguments
if test $# -gt $((OPTIND - 1))
then
  echo "ERROR: Too many command line arguments: $*" >&2
  usage >&2
  exit 1
fi
# Name of the local cluster: third field of the ClusterName line
# printed by "scontrol show config"
mycluster=$(scontrol show config | grep ClusterName | awk '{print $3}')
export mycluster
echo "Partition statistics for cluster $mycluster at $(date)"
# Identify any hidden partitions (see slurm.conf man-page):
# partitions listed by "sinfo --all" but not by "sinfo --hide",
# collected as a space-separated list
hidden_partitions=$(diff <(sinfo --hide -o %P) <(sinfo --all -o %P) | awk '$1==">" {printf("%s ", $2)}')
export hidden_partitions
# Collect per-partition statistics from sinfo and print one summary line
# per partition.  The awk program takes its settings from the exported
# environment variables printgres, minmax, colors, partitionlist,
# hidden_partitions, mycluster and federation, and runs squeue itself to
# count the CPU cores requested by pending jobs.
# PROCINFO["sorted_in"] is a GNU awk feature used to order the output.
# $federation, $partitionlist and $all_partitions are intentionally unquoted:
# each one is either empty or must expand to separate sinfo arguments.
# shellcheck disable=SC2086
sinfo --noheader --exact $federation $partitionlist $all_partitions -o "$sinfo_options" | awk '
BEGIN {
    # Environment variables
    printgres = ENVIRON["printgres"]
    minmax = ENVIRON["minmax"]
    colors = ENVIRON["colors"]
    partitionlist = ENVIRON["partitionlist"]
    hidden_partitions = ENVIRON["hidden_partitions"]
    mycluster = ENVIRON["mycluster"]
    federation = ENVIRON["federation"]
    clusternamelength = 7    # Minimum length of cluster name column
    # Identify hidden partitions
    if (split(hidden_partitions, array, " ") > 0)
        for (i in array)
            hidden[array[i]] = 1
    delete array
    # Get the list of all pending jobs.
    # The options must be separated explicitly: awk string concatenation
    # inserts no blank, so joining them directly gives "--federation-p <list>".
    JOBLIST = "squeue --noheader -t pending -O JobID,Partition,NumCPUs,Reason " federation " " partitionlist
    while ((JOBLIST | getline) > 0) {
        split($2, jobpartitions, ",")    # Job partitions (may be a comma-separated list)
        numcpus = $3
        reason = $4
        # A job submitted to several partitions is counted in each of them
        for (i in jobpartitions) {
            p = jobpartitions[i]
            if (reason == "(Resources)" || reason == "(Priority)")
                pending_resources[p] += numcpus
            else
                pending_other[p] += numcpus
        }
    }
    close (JOBLIST)
    delete jobpartitions
    # Define terminal colors for the output if requested
    if (colors > 0) {
        # See http://en.wikipedia.org/wiki/ANSI_escape_code#Colors
        RED="\033[1;31m"
        GREEN="\033[1;32m"
        MAGENTA="\033[1;35m"
        NORMAL="\033[0m"
    }
}
{
    # One input record per partition and node state combination
    # Partitions
    isdefault = sub(/\*$/, "", $1)    # Strip trailing * for default partition
    p = $1                # Partition name
    partition[p] = p      # Partition name
    len = length(p)       # Length of partition name string
    if (isdefault > 0) {  # The default partition
        defaultpartition[p] = 1
        len++             # Add 1 character to length
    }
    if ($11 == "yes") {   # Only user root may initiate jobs, "yes" or "no"
        root_only[p] = 1
        len++             # Add 1 character to length
    }
    if (hidden[p] > 0)
        len++             # Add 1 character to length
    if (defaultpartition[p] > 0 || root_only[p] > 0 || hidden[p] > 0)
        len++             # Add room for a :
    if (len > maxlength) maxlength = len    # Calculate maximum string length
    part_order[p] = NR    # Maintain Slurm ordering (index) of partitions
    state[p] = $2         # Partition state: up or down
    nodes[p] += $3        # Number of nodes in partition
    # CPU cores
    split($4, cpus, "/")  # Split CPU fields A/I/O/T in $4
    freecores[p] += cpus[2]
    totalcores[p] += cpus[4]
    # Cores per node; a record reporting 0 nodes must not abort awk
    # with a division by zero error
    if ($3 > 0)
        cpn = cpus[4] / $3
    else
        cpn = 0
    if (corespernode[p] == 0 || cpn < corespernode[p])    # Select the lowest count of cores per node
        corespernode[p] = cpn
    if (minmax > 0) {
        # Save min and max core count
        if (mincores[p] == 0 || cpn < mincores[p])
            mincores[p] = cpn
        if (cpn > maxcores[p])
            maxcores[p] = cpn
    }
    # RAM memory
    mem = $5              # Node memory
    n = sub(/\+$/, "", mem)    # Strip trailing +
    if (n > 0)
        memoryplus[p] = "+"
    else
        if (memoryplus[p] == "")
            memoryplus[p] = " "    # Only overwrite empty string
    mem = int(mem / 1000)          # Convert MB to GB
    if (memory[p] == 0 || mem < memory[p])
        memory[p] = mem            # Save the minimum memory size
    if (mem > memory[p])
        memoryplus[p] = "+"        # Some memory is larger than the minimum size
    if (minmax > 0) {
        # Save min and max memory sizes
        if (minmemory[p] == 0 || mem < minmemory[p])
            minmemory[p] = mem
        if (mem > maxmemory[p])
            maxmemory[p] = mem
    }
    # Time limits
    gsub(":00$", "", $6)  # Strip time limit seconds :00
    timelimit[p] = $6
    gsub(":00$", "", $7)  # Strip time limit seconds :00
    defaulttime[p] = $7
    # Job sizes
    split($8, jobsize, "-")    # Job size min-max nodes
    minnodes[p] = jobsize[1]
    maxnodes[p] = jobsize[2]
    # Node states
    nodestate = $9        # Node state
    n = sub("@", "", nodestate)
    if (n > 0) pending_reboot[p] = "@"    # Nodes pending a reboot
    if (index(nodestate, "maint") > 0) maintenance[p] = "$"    # Nodes in maintenance mode
    if (nodestate == "idle")
        freenodes[p] += $3
    # GRES (Generic Resources)
    if ($10 != "(null)") {    # Node GRES
        if (nodestate == "idle")
            gpustate = ":free"
        else if (nodestate == "mixed")
            gpustate = ":mix"
        else
            gpustate = ":used"
        # Entries for the different node states are joined by +
        if (gres[p] == "")
            gres[p] = $10 "(" $3 gpustate ")"
        else
            gres[p] = gres[p] "+" $10 "(" $3 gpustate ")"
    }
    # Federations (from Slurm 18.08)
    # Field 12 (%V) is only present when the -f option was given
    if ($12 == "N/A") {   # Not a federation of clusters
        clustername[p] = mycluster    # Default cluster name
    } else
        clustername[p] = $12          # Cluster name in a federation
    n = length(clustername[p])
    if (n > clusternamelength) clusternamelength = n    # Max clustername string length
} END {
    # Partition column output format string:
    # The format assumes <1000k cores (6 digits) and <100k nodes (5 digits), but this can be adjusted
    columnfmt = "%*s %5s %5d %s%5d%s %6d %s%6d%s %s%6d%s %6d %5.5s %5.5s %10s %5.5s %8s"
    # Column header lines
    # NOTE(review): the header texts use single blanks while columnfmt uses
    # fixed widths - confirm that the headers line up with the columns.
    header1 = "Partition #Nodes #CPU_cores Cores_pending Job_Nodes MaxJobTime Cores Mem/Node"
    header2 = "Name State Total Idle Total Idle Resorc Other Min Max Day-hr:mn /node (GB)"
    # Prepend some spaces for partition name length (shift by 5 characters)
    n = maxlength - 5
    header1 = sprintf("%*.*s %s", n, n, " ", header1)
    header2 = sprintf("%*.*s %s", n, n, " ", header2)
    if (federation != "") {    # Prepend cluster name header
        n = clusternamelength
        header1 = sprintf("%*.*s %s", n, n, "Cluster", header1)
        header2 = sprintf("%*.*s %s", n, n, "Name", header2)
    }
    if (printgres > 0) {       # Append GRES header
        header1 = header1 " GRES "
        header2 = header2 " (#Nodes:state)"
    }
    # Print the header lines
    printf("%s\n", header1)
    printf("%s\n", header2)
    # Sort arrays by element values:
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@val_type_asc"
    for (p in part_order) {
        pname = p              # Partition name
        # Append partition flags
        if (defaultpartition[p] > 0 || root_only[p] > 0 || hidden[p] > 0)
            pname = pname ":"  # Append a : before the flags
        if (defaultpartition[p] > 0)
            pname = pname "*"  # Append * for the default partition
        if (root_only[p] > 0)
            pname = pname "R"  # Append R for root-only partitions
        if (hidden[p] > 0)
            pname = pname "H"  # Append H for hidden partitions
        # Truncate long partition names and replace last character by a +
        if (length(pname) > maxlength)
            pname = sprintf("%*.*s+", maxlength-1, maxlength-1, pname)
        if (pending_reboot[p] != "") state[p] = state[p] "@"
        if (maintenance[p] != "") state[p] = state[p] "$"    # Append $ for nodes in maintenance state
        if (minmax == 0) {
            memsize = memory[p] memoryplus[p]
            cores = corespernode[p]
        } else {
            # Display min-max values
            if (minmemory[p] == maxmemory[p])
                memsize = memory[p]
            else
                memsize = minmemory[p] "-" maxmemory[p]
            if (mincores[p] == maxcores[p])
                cores = corespernode[p]
            else
                cores = mincores[p] "-" maxcores[p]
        }
        if (federation != "")    # Print the cluster name
            printf("%*.*s ", clusternamelength, clusternamelength, clustername[p])
        # Flag free nodes and cores in GREEN if nothing is pending
        if (freenodes[p] > 0 && pending_resources[p] == 0)
            colornodes = GREEN
        else
            colornodes = NORMAL
        if (freecores[p] > 0 && pending_resources[p] == 0)
            colorcores = GREEN
        else
            colorcores = NORMAL
        # Flag cores with pending_resources in RED
        if (pending_resources[p] > 0)
            colorpending = RED
        else
            colorpending = NORMAL
        printf(columnfmt, maxlength, pname, state[p], nodes[p],
            colornodes, freenodes[p], NORMAL,
            totalcores[p], colorcores, freecores[p], NORMAL,
            colorpending, pending_resources[p], NORMAL, pending_other[p],
            minnodes[p], maxnodes[p], timelimit[p], cores, memsize)
        if (printgres == 1)    # Print the GRES information
            print " " gres[p]
        else
            print ""
    }
}'