-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcondor_check
executable file
·180 lines (148 loc) · 6.86 KB
/
condor_check
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/bin/bash
# TODO:
# put raw outputs in json
# pressure and running from analysis and production separately
# update for MULTICORE
# maintenance broken?
# Load the glideinWMSMonitor environment. The release directory must be
# provided by the caller's environment; its bashrc presumably defines the
# helper functions called later in this script (verify against that file).
if [[ -z "$glideinWMSMonitor_RELEASE_DIR" ]]; then
  echo "ERROR: glideinWMSMonitor source code missing."
  exit 1
else
  source "$glideinWMSMonitor_RELEASE_DIR/bashrc"
fi

# Positional arguments:
#   $1 - primary collector; used for the pool queries in this script
#   $2 - optional second collector
COLLECTOR1=$1
COLLECTOR2=$2
if [[ -z "$COLLECTOR2" ]]; then
  # No second collector given: list the collector machines known to the pool
  # and take the last one that is not the primary. -F makes the host name a
  # literal string, so the dots in it do not act as regex wildcards.
  COLLECTOR2=$(condor_status -pool "$COLLECTOR1" -collector \
    -format '%s\n' Machine | grep -v -F -- "$COLLECTOR1" | tail -1)
fi
# Header
# NOTE(review): "glidinWMS" in the banner looks like a typo for "glideinWMS";
# it is left unchanged in case consumers of this output match on the text.
# The date substitution is intentionally unquoted so that runs of whitespace
# in the date string are collapsed by word splitting.
# shellcheck disable=SC2046
echo "Summary Scheduler Table for glidinWMS pool $COLLECTOR1 at" $(/bin/date -u)
echo

# Summary Table from the Collector
condor_status -pool "$COLLECTOR1" -schedd || exit 2
echo

# Last Negotiation Cycle time
# Query the negotiator once and extract all three durations from the same
# snapshot. Capturing the command output directly (rather than the tail of a
# pipeline) lets a failing condor_status actually trigger the exit code 3.
nego_ads=$(condor_status -pool "$COLLECTOR1" -nego -l) || exit 3

negotime=$(awk '/^LastNegotiationCycleDuration1/ {print $3}' <<<"$nego_ads")
echo "Negotiation time (ideally under 300s) T1 = ${negotime}s"
negotime=$(awk '/^LastNegotiationCycleDuration2/ {print $3}' <<<"$nego_ads")
echo "Negotiation time (ideally under 300s) T2_US = ${negotime}s"
negotime=$(awk '/^LastNegotiationCycleDuration0/ {print $3}' <<<"$nego_ads")
echo "Negotiation time (ideally under 300s) other = ${negotime}s"
echo
# Compare the total job counts reported by the two collectors.
# Skipped when no second collector is known: either the variable is empty
# (which previously produced a `test` syntax error) or it holds the literal
# value "unknown".
if [[ -n "$COLLECTOR2" && "$COLLECTOR2" != "unknown" ]]; then
  # Second column of the last line of the schedd totals, as an integer.
  total1=$(condor_status -schedd -total -pool "$COLLECTOR1" | tail -1 | awk '{print int($2)}')
  total2=$(condor_status -schedd -total -pool "$COLLECTOR2" | tail -1 | awk '{print int($2)}')
  # Default each side to 0 so that a missing total cannot shift the operands.
  colldiff=$(( ${total1:-0} - ${total2:-0} ))
  echo "Difference between collector total job counts: $total1 ($COLLECTOR1) - $total2 ($COLLECTOR2) = $colldiff"
  echo
fi
# Frontend groups: count how many ads in the pool carry each
# GLIDECLIENT_Group value.
echo "Frontend Group glidein counts:"
condor_status -pool "$COLLECTOR1" -format '%s\n' GLIDECLIENT_Group \
  | sort \
  | uniq -c
echo
# Run the collector comparison only when both collectors are known.
# A single [[ ... && ... ]] is used because nesting `[ ... ]` inside `[ ... ]`
# is not valid `test` syntax and always evaluates as false, which would keep
# this section from ever running.
# "unknown" is treated as "no second collector", matching the job-count
# comparison earlier in this script.
if [[ -n "$COLLECTOR1" && -n "$COLLECTOR2" && "$COLLECTOR2" != "unknown" ]]; then
  echo "Compare Collectors:"
  "$glideinWMSMonitor_RELEASE_DIR/CompareCollectors.sh" "$COLLECTOR1" "$COLLECTOR2"
  echo
fi
# Each helper below prints the path of a temporary file holding its result.
# Register the cleanup before creating any of them so the files are removed
# on every exit path, including the early "exit 4..7" failures.
condor_check_cleanup() {
  local tmpfile
  for tmpfile in "$PLEDGES" "$SEDFILE" "$ANAUSAGE" "$ALLUSAGE" "$DOWNTIMES" \
    "$CLAIMED" "$RUNNING" "$DESIRED" "$DESIANA"; do
    if [[ -n "$tmpfile" ]]; then
      rm -f -- "$tmpfile"
    fi
  done
}
trap condor_check_cleanup EXIT

# get information from sitedb about pledges and se names by CMSSite name
PLEDGES=$(get_pledges_from_sitedb)
SEDFILE=$(translate_se_names_in_sitedb_to_cmssite)

# Get information from dashboard about avg and max usage by CMSSite name
ANAUSAGE=$(dashboard_usage_by_site analysis "1 month ago")
ALLUSAGE=$(dashboard_usage_by_site all "1 month ago")

# get information from SSB about site downtimes
DOWNTIMES=$(site_downtimes_from_ssb)

# get claimed and running pilots CPUs, DESIRED_Sites for each pool.
# Each failure has its own exit code so the failing query can be identified.
CLAIMED=$(get_pilot_cpus_by_site "$COLLECTOR1" -claimed) || exit 4
RUNNING=$(get_pilot_cpus_by_site "$COLLECTOR1") || exit 5
DESIRED=$(get_DESIRED_Sites "$COLLECTOR1") || exit 6
# Same query restricted to jobs whose AcctGroup is "analysis".
DESIANA=$(get_DESIRED_Sites "$COLLECTOR1" \
  -const '(AcctGroup=?="analysis")') || exit 7
# Column headings for the per-site table of pilots and queued-job pressure.
# Two heading rows are printed, followed by a blank line.
printf "%-20s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s %-18s\n" \
  "Site" "Pledge" "Pledged" "Analysis" "All" "Maximum" \
  "Claimed" "Unclaimed" "Pressure" "Exclusive" "Pressure" "Maintenance"
printf "%30s%10s%10s%10s%10s%10s%10s%20s%10s%10s\n" \
  "Updated" "to CMS" "Usage 1m" "Usage 1m" "Usage 1m" \
  "CPUs" "CPUs" "Pressure" "CRAB3"
echo

# Running totals accumulated over all sites and printed in the table footer.
totalpledge=0 totalclaimed=0 totalunclaimed=0 totalexclusivepressure=0
totalanausage=0 totalallusage=0 totalmaxusage=0
# Loop over sites for the table.
# The site list is the third "/"-separated field of each line of $SEDFILE,
# sorted and de-duplicated.
sites=$(awk -F/ '{print $3}' "$SEDFILE" | sort -u)

# Word splitting of $sites is intentional: one site name per word.
for site in $sites; do
  # Claimed CPUs at site: first column of the "{site}" line in $CLAIMED.
  claimed=$(grep -- "{$site}" "$CLAIMED" | awk '{print $1}')
  claimed=${claimed:-0}
  totalclaimed=$(( totalclaimed + claimed ))

  # Idle CPUs at site = running - claimed:
  unclaimed=$(grep -- "{$site}" "$RUNNING" | awk -v x="$claimed" '{print $1-x}')
  unclaimed=${unclaimed:-0}
  totalunclaimed=$(( totalunclaimed + unclaimed ))

  # Get queued jobs for site (not exclusively), for all jobs and for the
  # analysis-only list.
  # NOTE(review): $site is matched as an unanchored pattern, so lines that
  # mention a longer site name containing $site are counted as well — confirm
  # whether that is intended.
  pressure=$(grep -c -- "$site" "$DESIRED")
  pressure=${pressure:-0}
  pressana=$(grep -c -- "$site" "$DESIANA")
  pressana=${pressana:-0}

  # Get jobs queued only for this site: lines whose first field is exactly
  # the site name.
  exclusivepressure=$(awk -v site="$site" '$1 == site { n++ } END { print n + 0 }' "$DESIRED")
  totalexclusivepressure=$(( totalexclusivepressure + exclusivepressure ))

  # Extract pledge for this site (total for all activities).
  # Federation pledges may have duplicated entries (one SiteDB, one REBUS),
  # so take the last one. The line is fetched once and both the pledge value
  # (field 2) and its date (field 3) are read from it.
  pledge_line=$(grep -- "^$site," "$PLEDGES" | tail -1)
  pledge=$(awk -F, '{print int($2)}' <<<"$pledge_line")
  pledge=${pledge:-0}
  totalpledge=$(( totalpledge + pledge ))

  # Extract date of the pledge
  validityofpledge=$(awk -F, '{print $3}' <<<"$pledge_line")
  validityofpledge=${validityofpledge:-N/A}

  # Extract average job slots at site from Dashboard for analysis
  avgusage=$(grep -- "^$site," "$ANAUSAGE" | awk -F, '{print int($2)}')
  avgusage=${avgusage:-0}
  totalanausage=$(( totalanausage + avgusage ))

  # and for all activities: field 2 is the average, field 3 the maximum.
  allusage_line=$(grep -- "^$site," "$ALLUSAGE")
  allusage=$(awk -F, '{print int($2)}' <<<"$allusage_line")
  allusage=${allusage:-0}
  totalallusage=$(( totalallusage + allusage ))

  # Extract max job slots at site from Dashboard (all activities)
  maxusage=$(awk -F, '{print int($3)}' <<<"$allusage_line")
  maxusage=${maxusage:-0}
  totalmaxusage=$(( totalmaxusage + maxusage ))

  # downtimes
  downtime=$(grep -- "^$site," "$DOWNTIMES" | awk -F, '{print $2}')

  # skip meaningless entries, no pledge, no demand
  # (the totals above are accumulated before this filter, so skipped sites
  # still contribute to the footer sums)
  if (( pledge + claimed + unclaimed + exclusivepressure == 0 )); then
    continue
  fi
  # For every pool other than vocms097.cern.ch, a pledge alone is not enough
  # to list a site: there must be pilots or exclusively queued jobs.
  if [[ "$COLLECTOR1" != "vocms097.cern.ch" ]] \
    && (( claimed + unclaimed + exclusivepressure == 0 )); then
    continue
  fi

  # Arguments are quoted so that a value containing whitespace cannot shift
  # the remaining columns.
  printf "%-20s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s %-18s\n" \
    "$site" "$validityofpledge" "$pledge" "$avgusage" "$allusage" "$maxusage" \
    "$claimed" "$unclaimed" "$pressure" "$exclusivepressure" "$pressana" "$downtime"
done
# Table footer: a single "SUM" row with the accumulated column totals,
# followed by a blank line. No sum is kept for the non-exclusive pressure
# column, so that cell shows "N/A".
totalpressure="N/A"
printf "\n%-30s%10s%10s%10s%10s%10s%10s%10s%10s\n" \
  "SUM" \
  $totalpledge $totalanausage $totalallusage $totalmaxusage \
  $totalclaimed $totalunclaimed $totalpressure $totalexclusivepressure
echo
# Explanatory notes, printed only for the vocms097.cern.ch pool.
# [[ ]] with a quoted operand avoids a `test` syntax error when COLLECTOR1 is
# empty.
if [[ "$COLLECTOR1" == "vocms097.cern.ch" ]]; then
  # Quoted delimiter: the text below is emitted literally, with no expansion.
  cat <<'EOF'
Notes:
* Pledges are 100% of the last pledge entered in SiteDB for the site for all activities.
* Analysis Usage statistics are from the last month in Dashboard for activity=analysis only and includes
CRAB2 and CRAB3 (verify).
* All Activity Usage statistics are from the last month in Dashboard for all activities including production.
* The Site Table does not include DAG jobs (from CRAB3) which do not run at a DESIRED_Site, but rather
on the schedd.
* Sites are only listed in the Site Table if there is demand (running or queued) or pledged resources.
EOF
fi
# clean up temp files
# $DESIANA (the analysis-only DESIRED_Sites file) is included so that it is
# not left behind. -f keeps the cleanup quiet if a file is already gone, and
# quoting keeps an unexpected value from being split into several paths.
rm -f -- "$PLEDGES" "$SEDFILE" "$CLAIMED" "$RUNNING" "$DESIRED" "$DESIANA" \
  "$ANAUSAGE" "$DOWNTIMES" "$ALLUSAGE"
exit 0