diff --git a/dev/condor_functions.sh b/dev/condor_functions.sh
new file mode 100644
index 0000000..b78bdc4
--- /dev/null
+++ b/dev/condor_functions.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+getClassAds() {
+    # Dump a fixed set of ClassAds for queued, running and (with
+    # condor_history) jobs from the past 24h.  If the remote query fails,
+    # retry by gsissh-ing to the schedd machine and running it there.
+    #
+    # Usage:
+    #   getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_history"
+    #   getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_q"
+    # Output:
+    #   Space separated list of job ClassAds, one row per job.
+    #
+    # Called from condor_history_dump
+
+    POOLNAME=$1 ; shift
+    SCHEDDNAME=$1 ; shift
+    MACHINENAME=$1 ; shift
+
+    NOW=$(/bin/date +%s)
+    # 24h look-back window; $(( )) replaces the deprecated $[ ] form
+    YESTERDAY=$(( NOW - 86400 ))
+    # Build the query as a string so the identical command can be re-run
+    # through gsissh on the schedd host if the remote query fails.
+    # NOTE: labels must match the attribute names; the original had
+    # DESRIED_Sites / DESRIED_SEs typos which broke downstream grepping.
+    command="$@ \
+       -const '(EnteredCurrentStatus>$YESTERDAY)' \
+       -format 'JobStatus=%i\ ' JobStatus \
+       -format 'LastJobStatus=%i\ ' LastJobStatus \
+       -format 'ExitCode=%i\ ' ExitCode \
+       -format 'EnteredCurrentStatus=%i\ ' EnteredCurrentStatus \
+       -format 'ImageSize=%i\ ' ImageSize \
+       -format 'RemoteWallClockTime=%i\ ' RemoteWallClockTime \
+       -format 'RemoteUserCpu=%i\ ' RemoteUserCpu \
+       -format 'LastRemoteHost=%s\ ' LastRemoteHost \
+       -format 'MATCH_GLIDEIN_CMSSite=%s\ ' MATCH_GLIDEIN_CMSSite \
+       -format 'DESIRED_Sites=%s\ ' DESIRED_Sites \
+       -format 'DESIRED_SEs=%s\ ' DESIRED_SEs \
+       -format 'Owner=%s\ ' Owner \
+       -format 'AccountingGroup=%s\ ' AccountingGroup \
+       -format 'Iwd=%s\ ' Iwd \
+       -format 'HoldReasonCode=%i\ ' HoldReasonCode \
+       -format 'HoldReasonSubCode=%i\ ' HoldReasonSubCode \
+       -format 'HoldReason=%s\ ' HoldReason \
+       -format 'GlobalJobId=%s\\n' GlobalJobId"
+    eval $command -pool $POOLNAME -name $SCHEDDNAME || gsissh $MACHINENAME $command
+    rc=$?
+    return $rc
+}
+
+get_pilots_by_site() {
+    # List the number of pilots running per site.
+    #
+    # Usage:
+    #   get_pilots_by_site POOLNAME [optional args for condor_status]
+    # Output:
+    #   File name of a temporary file containing the numbers of pilots
+    #   running at CMSSites, one line per site.  The caller removes the file.
+
+    PILOTS=$(mktemp -t PILOTS.txt.XXXXXXX) || return 1
+    # {%s} braces make the site name greppable as an exact token later
+    condor_status -pool $@ -format '{%s}\n' GLIDEIN_CMSSite | sort | uniq -c > "$PILOTS" || return 2
+    echo "$PILOTS"
+    return 0
+}
+
+get_DESIRED_Sites() {
+    # Get all queued jobs' DESIRED_Sites, translating from DESIRED_SEs
+    # if needed (i.e. for CRAB2).  If DESIRED_Sites exists, take that.
+    # Otherwise take DESIRED_SEs and translate using SEDFILE from SiteDB.
+    # Note that DAG jobs do not have DESIRED_Sites defined since they
+    # run on a schedd and are not counted here.
+    #
+    # Usage:
+    #   get_DESIRED_Sites $POOLNAME
+    # Output:
+    #   File name of a temporary file containing the list of DESIRED_Sites,
+    #   one line per job.  The caller removes the file.
+
+    POOLNAME=$1
+
+    source $glideinWMSMonitor_RELEASE_DIR/sitedb_functions.sh
+    SEDFILE=$(translate_se_names_in_sitedb_to_cmssite)
+
+    # schedds that currently have idle jobs, as repeated "-name X" args
+    SCHEDDS=$(condor_status -pool $POOLNAME -const '(TotalIdleJobs>0)' -schedd -format ' -name %s' Name) || return 1
+    DESIRED=$(mktemp -t DESIRED.txt.XXXXXXX) || return 2
+
+    # run condor_q only if there are queued jobs in the pool:
+    if [ $(echo $SCHEDDS | wc -w) -ne 0 ] ; then
+        # JobStatus=?=1 selects idle jobs; 'undefined' values are blanked so
+        # awk picks DESIRED_SEs when DESIRED_Sites is missing (CRAB2 case)
+        condor_q $SCHEDDS -pool $POOLNAME -const '(JobStatus=?=1)' \
+            -format '%s' DESIRED_Sites -format ' %s' DESIRED_SEs -format ' %s\n' Owner \
+            | sed 's/undefined//g' | awk '{print $1}' | sed -f "$SEDFILE" >> "$DESIRED"
+    fi
+
+    echo "$DESIRED"
+    rm "$SEDFILE"
+    return 0
+}
+
+condor_history_dump() {
+    # Dump all ClassAds defined in getClassAds for all queued, running
+    # and jobs that completed in the past day in a glideinWMS pool.
+    #
+    # Usage:
+    #   condor_history_dump $POOLNAME
+    # Output:
+    #   ClassAd values, one line per job.
+    #
+    POOLNAME=$1
+    # each entry is "ScheddName,MachineName"
+    SCHEDDS=$(condor_status -pool $POOLNAME -schedd -format '%s\,' Name -format '%s\n' Machine)
+    for SCHEDD in $SCHEDDS ; do
+        SCHEDDNAME=$( echo $SCHEDD | awk -F\, '{print $1}')
+        MACHINENAME=$(echo $SCHEDD | awk -F\, '{print $2}')
+        getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_history"
+        getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_q"
+    done
+    return 0
+}
+
+condor_exit_codes() {
+    # Extract the possible matching CMSSW error codes given the %256
+    # ExitCode from HTCondor.  The CMSSW exit codes can be found in the $URL
+    # but it is not publicly visible yet.  Later you can download it
+    # periodically to the release directory.
+    #
+    # Usage:
+    #   condor_exit_codes $CONDOR_EXIT_CODE
+    # Output:
+    #   CMSSW exit code matches and text explanations.
+    #
+    CONDOR_EXIT_CODE=$1
+    #
+    # Exit code explanation file; fetch once and cache in the release dir
+    #
+    FILE=$glideinWMSMonitor_RELEASE_DIR/JobExitCodes
+    if [ ! -f "$FILE" ] ; then
+        URL="https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes"
+        curl -ks -o "$FILE" "$URL"
+    fi
+    #
+    # grep the explanation of a particular code(s); pattern is quoted so the
+    # shell cannot glob-expand it (the original relied on unquoted globs)
+    #
+    grep -e - "$FILE" | grep -o '[0-9]* - .*' | sed 's/<.*>//g' \
+        | awk -F\- -v code=$CONDOR_EXIT_CODE '(code==$1%256){print $0}'
+    return 0
+}
+
+# Condor Hold Reason Codes: http://research.cs.wisc.edu/htcondor/manual/v7.6/10_Appendix_A.html
diff --git a/dev/monitor_functions.sh b/dev/monitor_functions.sh
new file mode 100755
index 0000000..fa925f6
--- /dev/null
+++ b/dev/monitor_functions.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+
+if [ -z "$glideinWMSMonitor_RELEASE_DIR" ] ; then
+    echo "ERROR: glideinWMSMonitor source code missing." >&2
+    exit 1
+else
+    source $glideinWMSMonitor_RELEASE_DIR/bashrc
+fi
+
+# per-invocation scratch area, removed at the end of the script
+export TMPDIR=$glideinWMSMonitor_OUTPUT_DIR/tmp_$$
+mkdir $TMPDIR
+OUTFILE=$1
+
+# open the JSON document
+echo "{" > $OUTFILE
+
+pool_table() {
+    # Emit one JSON fragment summarizing a glideinWMS pool.
+    # Collector1 has the negotiator, Collector2 is the HA backup.
+    #
+    # Usage:
+    #   pool_table COLLECTOR1 COLLECTOR2 NAME...
+    COLLECTOR1=$1
+    COLLECTOR2=$2
+    shift; shift
+    NAME=$@
+    NOW=$(/bin/date -u)
+
+    negotime=$(condor_status -pool $COLLECTOR1 -nego -l | grep LastNegotiationCycleDuration0 | awk '{print int($3)}')
+    total1=$(condor_status -schedd -total -pool $COLLECTOR1 | tail -1 | awk '{print int($2)}')
+    total2=$(condor_status -schedd -total -pool $COLLECTOR2 | tail -1 | awk '{print int($2)}')
+    colldiff=$(echo $total1 $total2 | awk '{print int($1-$2)}')
+
+    # JSON null for anything we failed to measure
+    if [ -z "$negotime" ] ; then negotime=null ; fi
+    if [ -z "$total1" ]   ; then total1=null   ; fi
+    if [ -z "$total2" ]   ; then total2=null   ; fi
+    if [ -z "$colldiff" ] ; then colldiff=null ; fi
+
+    # NOTE(review): the heredoc body below and the closing brace of this
+    # function were lost when this patch was flattened; the JSON row is
+    # reconstructed minimally -- confirm against the original script.
+    cat << EOF
+        ["$NAME","$NOW",$negotime,$total1,$total2,$colldiff],
+EOF
+}
+
+# NOTE(review): this heredoc header was also lost in the flattening and is
+# reconstructed -- confirm field order against pool_table's output row.
+cat << EOF >> $OUTFILE
+  "Pool Table": {
+    "header": [
+      "Pool","Date","Negotiation Time",
+      "Running Jobs Collector1","Running Jobs Collector2","Collector Difference"
+    ],
+    "data": [
+EOF
+pool_table vocms097.cern.ch vocms099.cern.ch "Global" >>$OUTFILE
+pool_table vocms97.cern.ch cmssrv119.fnal.gov "Production" >>$OUTFILE
+
+cat << EOF >> $OUTFILE
+  "Site Table": {
+    "header": [
+      "Site","Maintenance",
+      "Pledge Info","Total Pledge","Pledged Analysis",
+      "Average Analysis Usage","Maximum Analysis Usage",
+      "Average Analysis Test Usage","Maximum Analysis Test Usage",
+      "Average Production Usage","Maximum Production Usage",
+      "Average Total Usage","Maximum Total Usage",
+      "Claimed Pilots Analysis","Total Pilots Analysis","Analysis Pressure","Exclusive Analysis Pressure",
+      "Claimed Pilots Global","Total Pilots Global","Global Pressure","Exclusive Global Pressure",
+      "Claimed Pilots Production","Total Pilots Production","Production Pressure","Exclusive Production Pressure"
+    ],
+    "data": [
+EOF
+
+# get information from sitedb about pledges and se names by CMSSite name
+DOWNTIME=$(site_downtimes_from_ssb)
+PLEDGES=$(get_pledges_from_sitedb)
+SEDFILE=$(translate_se_names_in_sitedb_to_cmssite)
+
+# job slot usage from dashboard - $2 is avg and $3 is max job slots used in a day in the time period
+USAGEANA=$(dashboard_usage_by_site analysis "1 month ago")
+USAGETST=$(dashboard_usage_by_site analysistest "1 month ago")
+USAGEPRO=$(dashboard_usage_by_site production "1 month ago")
+USAGEALL=$(dashboard_usage_by_site all "1 month ago")
+# get claimed and running pilots, DESIRED_Sites for each pool:
+CLAIMEDANA=$(get_pilots_by_site glidein-collector.t2.ucsd.edu -claimed)
+RUNNINGANA=$(get_pilots_by_site glidein-collector.t2.ucsd.edu)
+DESIREDANA=$(get_DESIRED_Sites glidein-collector.t2.ucsd.edu)
+
+CLAIMEDGLO=$(get_pilots_by_site vocms097.cern.ch -claimed)
+RUNNINGGLO=$(get_pilots_by_site vocms097.cern.ch)
+DESIREDGLO=$(get_DESIRED_Sites vocms097.cern.ch)
+
+CLAIMEDPRO=$(get_pilots_by_site vocms97.cern.ch -claimed)
+RUNNINGPRO=$(get_pilots_by_site vocms97.cern.ch)
+DESIREDPRO=$(get_DESIRED_Sites vocms97.cern.ch)
+
+
+# Loop over sites for the table; site names are the third /-field of the
+# sed translation file produced by translate_se_names_in_sitedb_to_cmssite
+sites=$(cat $SEDFILE | awk -F\/ '{print $3}' | sort | uniq)
+{
+for site in $sites ; do
+    downtime=$(grep ^$site\, $DOWNTIME | awk -F\, '{print $2}')
+    if [ -z "$downtime" ] ; then downtime=null ; fi
+
+    # analysis pledge is 50% of the total, or 10% for T0/T1 sites
+    # (NOTE(review): the original comment said 5% but the code divides by 10)
+    totalpledge=$( grep ^$site\, $PLEDGES | tail -1 | awk -F\, '{print int($2)}')
+    validityofpledge=$(grep ^$site\, $PLEDGES | tail -1 | awk -F\, '{print $3}')
+    if [ -z "$validityofpledge" ] ; then validityofpledge=null ; fi
+    # map "missing" and "zero" pledges both to JSON null; testing -eq only on
+    # non-empty values avoids the "[ null -eq 0 ]" integer-test error the
+    # original emitted to stderr
+    if [ -z "$totalpledge" ] ; then
+        totalpledge=null
+    elif [ "$totalpledge" -eq 0 ] ; then
+        totalpledge=null
+    fi
+
+    if echo $site | egrep -q '^T0|^T1' ; then
+        analysispledge=$(echo $totalpledge | awk '{print int($1/10.)}')
+    else
+        analysispledge=$(echo $totalpledge | awk '{print int($1/2.0)}')
+    fi
+    if [ -z "$analysispledge" ] ; then
+        analysispledge=null
+    elif [ "$analysispledge" -eq 0 ] ; then
+        analysispledge=null
+    fi
+
+    # dashboard usage: $2 is the average, $3 the max slots used per day
+    avgusageana=$(grep ^$site\, $USAGEANA | awk -F\, '{print int($2)}')
+    maxusageana=$(grep ^$site\, $USAGEANA | awk -F\, '{print int($3)}')
+    avgusagetst=$(grep ^$site\, $USAGETST | awk -F\, '{print int($2)}')
+    maxusagetst=$(grep ^$site\, $USAGETST | awk -F\, '{print int($3)}')
+    avgusagepro=$(grep ^$site\, $USAGEPRO | awk -F\, '{print int($2)}')
+    maxusagepro=$(grep ^$site\, $USAGEPRO | awk -F\, '{print int($3)}')
+    avgusageall=$(grep ^$site\, $USAGEALL | awk -F\, '{print int($2)}')
+    maxusageall=$(grep ^$site\, $USAGEALL | awk -F\, '{print int($3)}')
+
+    if [ -z "$avgusageana" ] ; then avgusageana=null ; fi
+    if [ -z "$maxusageana" ] ; then maxusageana=null ; fi
+    if [ -z "$avgusagetst" ] ; then avgusagetst=null ; fi
+    if [ -z "$maxusagetst" ] ; then maxusagetst=null ; fi
+    if [ -z "$avgusagepro" ] ; then avgusagepro=null ; fi
+    if [ -z "$maxusagepro" ] ; then maxusagepro=null ; fi
+    if [ -z "$avgusageall" ] ; then avgusageall=null ; fi
+    if [ -z "$maxusageall" ] ; then maxusageall=null ; fi
+
+    # pilots: the {site} braces come from get_pilots_by_site's output format
+    # pressure = queued jobs willing to run at the site;
+    # exclusive pressure = queued jobs that will run ONLY at this site
+    claimedana=$( grep \{$site\} $CLAIMEDANA | awk '{print int($1)}')
+    runningana=$( grep \{$site\} $RUNNINGANA | awk '{print int($1)}')
+    pressureana=$(grep $site $DESIREDANA | wc -l)
+    exclusivepressureana=$(grep $site $DESIREDANA | awk -v site=$site '$1==site{print $0}' | wc -l)
+
+    if [ -z "$claimedana" ]  ; then claimedana=null  ; fi
+    if [ -z "$runningana" ]  ; then runningana=null  ; fi
+    if [ -z "$pressureana" ] ; then pressureana=null ; fi
+    if [ -z "$exclusivepressureana" ] ; then exclusivepressureana=null ; fi
+
+    claimedglo=$( grep \{$site\} $CLAIMEDGLO | awk '{print int($1)}')
+    runningglo=$( grep \{$site\} $RUNNINGGLO | awk '{print int($1)}')
+    pressureglo=$(grep $site $DESIREDGLO | wc -l)
+    exclusivepressureglo=$(grep $site $DESIREDGLO | awk -v site=$site '$1==site{print $0}' | wc -l)
+
+    if [ -z "$claimedglo" ]  ; then claimedglo=null  ; fi
+    if [ -z "$runningglo" ]  ; then runningglo=null  ; fi
+    if [ -z "$pressureglo" ] ; then pressureglo=null ; fi
+    if [ -z "$exclusivepressureglo" ] ; then exclusivepressureglo=null ; fi
+
+    claimedpro=$( grep \{$site\} $CLAIMEDPRO | awk '{print int($1)}')
+    runningpro=$( grep \{$site\} $RUNNINGPRO | awk '{print int($1)}')
+    pressurepro=$(grep $site $DESIREDPRO | wc -l)
+    exclusivepressurepro=$(grep $site $DESIREDPRO | awk -v site=$site '$1==site{print $0}' | wc -l)
+
+    if [ -z "$claimedpro" ]  ; then claimedpro=null  ; fi
+    if [ -z "$runningpro" ]  ; then runningpro=null  ; fi
+    if [ -z "$pressurepro" ] ; then pressurepro=null ; fi
+    if [ -z "$exclusivepressurepro" ] ; then exclusivepressurepro=null ; fi
+
+    # one JSON row per site; the trailing comma of the last row is stripped
+    # by the sed '$s/,$//' below to keep the array valid
+    printf '        ["%s","%s","%s",%s,%s,' $site "$downtime" "$validityofpledge" $totalpledge $analysispledge
+    printf '%s,%s,' $avgusageana $maxusageana
+    printf '%s,%s,' $avgusagetst $maxusagetst
+    printf '%s,%s,' $avgusagepro $maxusagepro
+    printf '%s,%s,' $avgusageall $maxusageall
+    printf '%s,%s,%s,%s,' $claimedana $runningana $pressureana $exclusivepressureana
+    printf '%s,%s,%s,%s,' $claimedglo $runningglo $pressureglo $exclusivepressureglo
+    printf '%s,%s,%s,%s],\n' $claimedpro $runningpro $pressurepro $exclusivepressurepro
+done
+} | sed '$s/,$//' | sed 's/\"null\"/null/g' >>$OUTFILE
+
+
+# close the json file
+cat << EOF >> $OUTFILE
+    ]
+  }
+}
+EOF
+
+# clean up temp files
+rm $DOWNTIME $PLEDGES $SEDFILE $USAGEANA $USAGETST $USAGEPRO $USAGEALL \
+   $CLAIMEDANA $RUNNINGANA $DESIREDANA $CLAIMEDGLO $RUNNINGGLO $DESIREDGLO \
+   $CLAIMEDPRO $RUNNINGPRO $DESIREDPRO
+rmdir $TMPDIR
+
+exit 0
diff --git a/dev/sitedb_functions.sh b/dev/sitedb_functions.sh
new file mode 100644
index 0000000..9082af0
--- /dev/null
+++ b/dev/sitedb_functions.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+get_federation_pledges() {
+    # Find the pledges for certain federated Tier-2 sites from information
+    # in REBUS.  Unfortunately the list of federations and their membership
+    # is not yet discoverable programmatically, so it is hard-coded below.
+    # Pledges (kHS06) are divided by 10 to approximate physical cores and
+    # split evenly among the federation members.
+    #
+    # Usage:
+    #   get_federation_pledges [year]     (default: 2014)
+    # Output:
+    #   csv lines "CMSSite,cores,FEDERATION", plus a final
+    #   "All_Sites,total,TOTAL" line summing all CPU pledges.
+    if [ ! -z "$1" ] ; then
+        year=$1
+    else
+        year=2014
+    fi
+
+    url="http://wlcg-rebus.cern.ch/apps/pledges/resources/$year/all/csv"
+    curl -ks $url | awk -F\, '
+        BEGIN{totalpledge=0}
+        {
+        federation=$3
+        cpu=$4
+        pledge=$8
+        if ( federation == "INFN T2 Federation" && cpu == "CPU" ) {
+            printf("T2_IT_Bari,%i,FEDERATION\n",pledge/10./4.)
+            printf("T2_IT_Legnaro,%i,FEDERATION\n",pledge/10./4.)
+            printf("T2_IT_Pisa,%i,FEDERATION\n",pledge/10./4.)
+            printf("T2_IT_Rome,%i,FEDERATION\n",pledge/10./4.)
+        }
+        if ( federation == "CMS Federation DESY RWTH Aachen" && cpu == "CPU" ) {
+            printf("T2_DE_DESY,%i,FEDERATION\n",pledge/10./2.)
+            printf("T2_DE_RWTH,%i,FEDERATION\n",pledge/10./2.)
+        }
+        if ( federation == "Belgian Tier-2 Federation" && cpu == "CPU" ) {
+            printf("T2_BE_IIHE,%i,FEDERATION\n",pledge/10./2.)
+            printf("T2_BE_UCL,%i,FEDERATION\n",pledge/10./2.)
+        }
+        if ( federation == "Russian Data-Intensive GRID" && cpu == "CPU" ) {
+            printf("T2_RU_IHEP,%i,FEDERATION\n",pledge/10./7.)
+            printf("T2_RU_INR,%i,FEDERATION\n",pledge/10./7.)
+            printf("T2_RU_ITEP,%i,FEDERATION\n",pledge/10./7.)
+            printf("T2_RU_JINR,%i,FEDERATION\n",pledge/10./7.)
+            printf("T2_RU_PNPI,%i,FEDERATION\n",pledge/10./7.)
+            printf("T2_RU_RRC_KI,%i,FEDERATION\n",pledge/10./7.)
+            printf("T2_RU_SINP,%i,FEDERATION\n",pledge/10./7.)
+        }
+        if ( federation == "London Tier 2" && cpu == "CPU" ) {
+            printf("T2_UK_London_IC,%i,FEDERATION\n",pledge/10./2.)
+            printf("T2_UK_London_Brunel,%i,FEDERATION\n",pledge/10./2.)
+        }
+        if ( federation == "SouthGrid" && cpu == "CPU" ) {
+            printf("T2_UK_SGrid_RALPP,%i,FEDERATION\n",pledge/10.)
+        }
+        if ( cpu == "CPU" ) {
+            totalpledge+=pledge
+        }
+        }
+        END{printf("All_Sites,%i,TOTAL\n",totalpledge/10.)}'
+    return 0
+}
+
+translate_site_names_from_sidedb_to_cmssite() {
+    # NOTE(review): "sidedb" is a typo for "sitedb", but the name is kept
+    # because both callers below use this exact spelling.
+    #
+    # Output: name of a sed file translating SiteDB site names (e.g. "ASGC")
+    # to CMSSite names (e.g. "T1_TW_ASGC"), in csv format.  Caller removes
+    # the file.  Returns 1 if no grid proxy is available.
+
+    if [ -z "$X509_USER_PROXY" ] ; then
+        echo "ERROR: X509_USER_PROXY not defined!"
+        return 1
+    fi
+
+    SEDFILE=$(mktemp -t SITELIST.sed.XXXXXXXXXX)
+    url="https://cmsweb.cern.ch/sitedb/data/prod/site-names"
+    # keep only "cms" rows; $4 is the SiteDB alias, $6 the CMSSite name
+    curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY $url \
+        | grep \"cms\" | awk -F\" '{print "s/^" $4 ",/" $6 ",/"}' | sed 's/ //g' > $SEDFILE
+
+    echo "$SEDFILE"
+    return 0
+}
+
+get_pledges_from_sitedb() {
+    # Output: comma separated list of CMSSite and latest CPU pledges
+    # in kHS06 divided by 10 to normalize roughly to cores.
+    # Federated pledges come at the end, so you need to take the
+    # last entry per site.  Earlier (zero) entries may be from SiteDB.
+    # Caller removes the returned file.
+
+    if [ -z "$X509_USER_PROXY" ] ; then
+        echo "ERROR: X509_USER_PROXY not defined!"
+        return 1
+    fi
+
+    # this url gives pledges by sitedb name like "ASGC"
+    url="https://cmsweb.cern.ch/sitedb/data/prod/resource-pledges"
+    thisyear=$(/bin/date +%Y)
+    TMPFILE=$(mktemp -t TMPPLEDGES.txt.XXXXXXXXXX) || return 1
+
+    # get pledges from sitedb only for this year and translate
+    # to CMSSite name not the generic site name
+    SEDFILE=$(translate_site_names_from_sidedb_to_cmssite)
+    curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY $url \
+        | awk -F\, -v ty=$thisyear '($4==ty){print $2 "," $3 "," $5}' \
+        | tr \[ \  | tr \" \  | sed 's/ //g' | sort | sed -f $SEDFILE | sort > $TMPFILE
+
+    # Remove multiple pledges for the same site ($1) for this year
+    # by taking the most recently entered ($2).  Approximate kHS06
+    # to physical cpu by dividing by 10.
+    PLEDGES=$(mktemp -t PLEDGES.txt.XXXXXXXXXX) || return 2
+    sites=$(cat $TMPFILE | awk -F\, '{print $1}' | sort | uniq | grep ^T)
+    for site in $sites ; do
+        grep ^$site\, $TMPFILE | tail -1 | awk -F\, '{print $1 "," int($3*1000./10.) "," strftime("%F",$2)}' >> $PLEDGES
+    done
+
+    # corrections for federation pledges.  Always take the last one
+    get_federation_pledges >> $PLEDGES
+
+    rm $TMPFILE $SEDFILE
+    echo "$PLEDGES"
+    return 0
+}
+
+translate_se_names_in_sitedb_to_cmssite() {
+    # Output: name of sed file translating SE names to CMSSite names.
+    # Caller removes the file.
+
+    # return (not exit: these functions are sourced, exit would kill the
+    # caller's shell) on mktemp failure, consistent with the siblings above
+    SELIST=$(mktemp -t SELIST.sed.XXXXXXXXXX) || return 1
+    SEDFILE=$(translate_site_names_from_sidedb_to_cmssite)
+
+    # from SiteDB get the list of SE, sitedb site name
+    # and translate sitedb site name to CMSSite name with $SEDFILE
+    # output as a sed file
+    url="https://cmsweb.cern.ch/sitedb/data/prod/site-resources"
+    curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY $url \
+        | grep \"SE\" | awk -F\" '{print $2 "," $6}' | sed 's/ //g' \
+        | sed -f $SEDFILE | sort | awk -F\, '{print "s/" $2 "/" $1 "/"}' > $SELIST
+
+    rm $SEDFILE
+    echo "$SELIST"
+    return 0
+}