Commit

Updated.
jamesletts committed May 21, 2014
1 parent fd1d652 commit 9fa3435
Showing 4 changed files with 166 additions and 25 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -22,3 +22,8 @@ Scripts:

* condor_history_dump.sh: Dumps schedd ClassAds for recently run jobs and the current queue.
* condor_history_analyze.sh: Analyzes the dump of the schedd ClassAds.


To do:

* Switch to using SiteDB API when possible? https://github.com/dmwm/WMCore/blob/master/src/python/WMCore/Services/SiteDB/SiteDB.py
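For reference, a rough sketch of what a direct SiteDB v2 query could look like (endpoint and proxy-based auth are assumptions; the WMCore client above would be the supported route):

  # List CMS site-name mappings from SiteDB, authenticating with a grid proxy
  # (assumed endpoint from SiteDB v2 on cmsweb):
  curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY \
      "https://cmsweb.cern.ch/sitedb/data/prod/site-names"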
53 changes: 32 additions & 21 deletions condor_check
@@ -7,48 +7,60 @@ else
source $glideinWMSMonitor_RELEASE_DIR/bashrc
fi

POOLNAME=$1
SHORT=$2
COLLECTOR1=$1
COLLECTOR2=$2
# COLLECTOR2 could in principle be discovered automatically, but that does not work at CERN.
SHORT=$3
if [ "X"$SHORT == "X" ] ; then SHORT="long" ; fi

# Header
echo Summary Table for glideinWMS pool $POOLNAME at `/bin/date -u`
echo Summary Table for glideinWMS pool $COLLECTOR1 at `/bin/date -u`
echo

# Summary Table from the Collector
condor_status -pool $POOLNAME -schedd || exit 2
condor_status -pool $COLLECTOR1 -schedd || exit 2
echo

# Last Negotiation Cycle time
negotime=`condor_status -pool $POOLNAME -nego -l | grep LastNegotiationCycleDuration0 | awk '{print $3}'` || exit 3
echo "Negotiation time = ${negotime}s"
negotime=`condor_status -pool $COLLECTOR1 -nego -l | grep LastNegotiationCycleDuration0 | awk '{print $3}'` || exit 3
echo "Negotiation time (ideally under 300s) = ${negotime}s"
echo
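The 300s guideline above could also be enforced with an explicit warning; a minimal sketch (an editor's assumption, not part of the original script):

# Warn if the negotiation cycle exceeds the 300s guideline.
# ${negotime%.*} drops any fractional part for the integer test;
# the error from an empty value is suppressed.
if [ "${negotime%.*}" -gt 300 ] 2>/dev/null ; then
echo "WARNING: negotiation cycle exceeded 300s"
fi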

# don't print the long table if short output was requested
if [ $SHORT == "short" ] ; then
exit 0
fi

total1=`condor_status -schedd -total -pool $COLLECTOR1 | tail -1 | awk '{print int($2)}'`
total2=`condor_status -schedd -total -pool $COLLECTOR2 | tail -1 | awk '{print int($2)}'`
colldiff=`echo $total1 $total2 | awk '{print int($1-$2)}'`
echo "Difference between collector total job counts: $total1 ($COLLECTOR1) - $total2 ($COLLECTOR2) = $colldiff"
echo


# get information from sitedb about pledges and se names by CMSSite name
PLEDGES=`get_pledges_from_sitedb`
SEDFILE=`translate_se_names_in_sitedb_to_cmssite`

# get information from dashboard about avg and max usage by CMSSite name
USAGE=`dashboard_usage_by_site analysis "3 months ago"`
MAXUSAGE=`dashboard_usage_by_site all "3 months ago"`

# Get information from dashboard about avg and max usage by CMSSite name
# Information from T2_US_Wisconsin before April 3, 2014 is inflated.
USAGEUW=`dashboard_usage_by_site analysis "3 weeks ago"`
MAXUSAGEUW=`dashboard_usage_by_site all "3 weeks ago"`
#USAGE=`dashboard_usage_by_site analysis "3 months ago"`
#MAXUSAGE=`dashboard_usage_by_site all "3 months ago"`
#USAGEUW=`dashboard_usage_by_site analysis "3 weeks ago"`
#MAXUSAGEUW=`dashboard_usage_by_site all "3 weeks ago"`
USAGE=`dashboard_usage_by_site analysis "1 month ago"`
MAXUSAGE=`dashboard_usage_by_site all "1 month ago"`
USAGEUW=$USAGE
MAXUSAGEUW=$MAXUSAGE

# get information from SSB about site downtimes
DOWNTIMES=`site_downtimes_from_ssb`

# get claimed and running pilots, DESIRED_Sites for each pool:
CLAIMED=`get_pilots_by_site $POOLNAME -claimed` || exit 4
RUNNING=`get_pilots_by_site $POOLNAME` || exit 5
DESIRED=`get_DESIRED_Sites $POOLNAME` || exit 6
RUNNINGPROD=`get_pilots_by_site "vocms97.cern.ch"` || exit 7
CLAIMED=`get_pilots_by_site $COLLECTOR1 -claimed` || exit 4
RUNNING=`get_pilots_by_site $COLLECTOR1` || exit 5
DESIRED=`get_DESIRED_Sites $COLLECTOR1` || exit 6
RUNNINGPROD=`get_pilots_by_site "vocms97.cern.ch"` || exit 7

# Print the table of pilots and pressure from queued jobs for each site
printf "%-20s%10s%10s%10s%10s%10s%10s%10s%10s%10s %-18s\n" "Site" "Pledge" "Pledged" "Average" "Maximum" "Claimed" "Unclaimed" "Pressure" "Exclusive" "Running" "Maintenance"
@@ -151,11 +163,10 @@ cat << EOF
Notes:
* Pledges are 50% of the last pledge entered in SiteDB for the site. Tier-1 pledges are set to zero for
analysis, even though analysis jobs can run at the Tier-1 sites.
* Usage statistics are from the last three months in Dashboard for analysis only.
* Usage numbers from Wisconsin in March are overinflated in the Dashboard before April 2014 so only the
past two weeks is used.
* Average Usage statistics are from the last month in Dashboard for activity=analysis only.
* Maximum Usage statistics are from the last month in Dashboard for all activities including production.
* The Site Table does not include DAG jobs (from CRAB3) which do not run at a DESIRED_Site, but rather
on the schedd.
* Sites are only listed in the Site Table if there is demand (running or queued) or pledged resources.
EOF

16 changes: 12 additions & 4 deletions condor_check.sh
@@ -20,7 +20,9 @@ cat >> $OUTFILE <<EOF
EOF
# run analysis of analysis ops pool, with a time limit of 400s.
alarm 300 $glideinWMSMonitor_RELEASE_DIR/condor_check glidein-collector-2.t2.ucsd.edu >> $OUTFILE
COLLECTOR1=glidein-collector-2.t2.ucsd.edu
COLLECTOR2=glidein-collector.t2.ucsd.edu
alarm 400 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 >> $OUTFILE
rc=$?

cat >> $OUTFILE <<EOF
@@ -30,21 +32,27 @@ cat >> $OUTFILE <<EOF
EOF
# run analysis of global pool, with a time limit of 100s.
alarm 300 $glideinWMSMonitor_RELEASE_DIR/condor_check vocms097.cern.ch short >> $OUTFILE
COLLECTOR1=vocms097.cern.ch
COLLECTOR2=vocms099.cern.ch
alarm 100 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 short >> $OUTFILE

cat >> $OUTFILE <<EOF
===================================================== PRODUCTION POOL =====================================================
EOF
alarm 300 $glideinWMSMonitor_RELEASE_DIR/condor_check vocms97.cern.ch short >> $OUTFILE
COLLECTOR1=vocms97.cern.ch
COLLECTOR2=unknown
alarm 100 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 short >> $OUTFILE

# if everything ran correctly, then update the latest file:
if [ $rc -eq 0 ] ; then
LINKNAME=$glideinWMSMonitor_OUTPUT_DIR/latest.txt
rm $LINKNAME
ln -s $OUTFILE $LINKNAME
fi
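Side note: the rm/ln pair above briefly leaves no latest.txt between the two commands. A one-command alternative (a sketch; behavior assumed equivalent for this use) would be:

# replace the symlink in a single command (-f force, -n no-dereference):
ln -sfn $OUTFILE $LINKNAME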

# make a nice html page as a test:
$glideinWMSMonitor_RELEASE_DIR/make_html_page.sh

exit 0
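The alarm helper used above is provided elsewhere in the release; a minimal sketch of the assumed semantics (run a command under a time limit in seconds) in terms of GNU coreutils timeout:

alarm () {
# run a command, killing it if it runs longer than the limit in seconds;
# the command's exit code is passed through (124 on timeout)
local limit=$1
shift
timeout "$limit" "$@"
}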
117 changes: 117 additions & 0 deletions dashboard_functions.sh
@@ -72,3 +72,120 @@ dashboard_usage_by_site() {
echo "$OUTPUTFILE"
return 0
}

dashboard_exit_status() {
# print the exit status of jobs from the dashboard by site
#
# Usage:
# dashboard_exit_status begin_date end_date activity
# where dates are in the form YYYY-mm-dd and activity=[analysis|production|all|analysistest]
#
# Input (csv from the dashboard, one line per site):
#   app-unknown,app-successful,app-failed,site-failed,cancelled,aborted,completed,site
# Output:
#   csv summary: nsites,completed,app-successful,success-rate(%)

date1=$1
date2=$2
activity=$3
url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumbers_terminatedcsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${activity}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=gstb"
curl -ks $url | dos2unix | awk -F\, '
BEGIN{
completed=0
appsuccessful=0
nsites=0
}
{
completed+=$7
appsuccessful+=$2
nsites+=1
}
END{
if ( completed > 0 ) {
successrate=appsuccessful/completed*100.0
} else {
successrate="N/A"
}
printf("%i,%i,%i,%s\n",nsites,completed,appsuccessful,successrate)
}'
return 0
}
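A usage sketch for the function above (hypothetical dates, illustrative numbers):

# success rate of analysis jobs over April 2014:
dashboard_exit_status 2014-04-01 2014-04-30 analysis
# prints e.g.: 120,1500000,1350000,90   (nsites,completed,app-successful,rate%)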

dashboard_job_slots_used() {
# print the jobs slots used per day from the dashboard by site
#
# Usage:
# dashboard_job_slots_used begin_date end_date activity
# where dates are in the form YYYY-mm-dd and activity=[analysis|production|all|analysistest]
#
# Output:
# csv list by site: site,average-job-slots-per-day
date1=$1
date2=$2
activity=$3

url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumberscsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${activity}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=r"

curl -ks $url | dos2unix | sort -t \, -k 3 | awk -F\, '
BEGIN{
lastsite="None"
totaljobslots=0
totaldays=0
}
{
site=$3
if ( site != lastsite && lastsite != "None" ){
if ( totaldays > 0 ) {
slotsperday=int(totaljobslots/totaldays)
} else {
slotsperday=0
}
printf("%s,%i\n",lastsite,slotsperday)
totaljobslots=0
totaldays=0
}
lastsite=site
totaljobslots+=$1
totaldays+=1
}
END{
if ( totaldays > 0 ) {
slotsperday=int(totaljobslots/totaldays)
} else {
slotsperday=0
}
printf("%s,%i\n",site,slotsperday)
}'
return 0
}
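A usage sketch for the function above (hypothetical dates, illustrative output):

# average job slots used per day, by site, for all activities over one week:
dashboard_job_slots_used 2014-05-12 2014-05-19 all
# prints lines like: T2_US_UCSD,3500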

dashboard_report() {
# print a summary table of analysis job slot usage and success rates
# from the dashboard over recent time periods
# ARGS: Granularity in days of the time period for the table
# ARGS: Number of time periods to display
GRANULARITY=$1
NUMBER_OF_PERIODS=$2
echo
echo "Analysis Jobs Report for the past $NUMBER_OF_PERIODS periods of $GRANULARITY days."
echo
printf "%10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n" \
"Begin" "End" "Analysis" "Analysis" "All" "Analysis" "Analysis" "Analysis" "Number" "Number" "App-"
printf "%10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n" \
"Date" "Date" "Job Slots" "Test" "Activity" "Job Slots" "Job Slots" "Job Slots" "of Sites" "of Jobs" "Success"
printf "%10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n" \
" " " " "Used" "Job Slots" "Job Slots" "at T1" "at T2" "at T3" " " "Completed" "Rate"
date1=`date -dlast-monday +%F`
for (( i=1; i<=$NUMBER_OF_PERIODS; i++ )) ; do
date2=$date1
date1=`date -d "$date2 -$GRANULARITY days" +%F`
printf "%10s %10s " $date1 $date2
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{x+=$2}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysistest | awk -F, 'BEGIN{x=0}{x+=$2}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 all | awk -F, 'BEGIN{x=0}{x+=$2}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T1/){x+=$2}}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T2/){x+=$2}}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T3/){x+=$2}}END{printf("%10i ",x)}'
dashboard_exit_status $date1 $date2 analysis | awk -F, '{printf("%10i %10i %10.1f%%\n",$1,$2,$4)}'
done
return
}
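A usage sketch (the first period ends last Monday, per the date arithmetic above):

# report four weekly periods:
dashboard_report 7 4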
