#!/bin/bash
#
# condor_history_analyze.sh - summarise the most recent job history dump:
# jobs per schedd, a success/failure/abort table, an exit-code breakdown,
# held jobs per site, and finally the user priorities of a pool.
#
# Usage: condor_history_analyze.sh [POOLNAME]
#   POOLNAME  central manager handed to "condor_userprio -pool"; when it is
#             omitted the -pool option is left out altogether.
#
# The history dump is expected to hold one job per line as space-separated
# Attribute=Value pairs, with JobStatus as the first attribute of the line.
#
# TODO: discover the exit code explanations automatically?
# TODO: DAG jobs are not included in the held table.
# TODO: report CRAB3 vs CRAB2 jobs.

# NOTE(review): nothing below calls a function from this file by name; it is
# sourced exactly as before, presumably for environment setup - confirm.
source /home/letts/scripts/condor_functions.sh

POOLNAME=${1:-}

readonly MONITOR_DIR=/crabprod/CSstoragePath/Monitor
readonly SECONDS_PER_DAY=86400

# Pick the most recently modified history dump published by the web server.
# A glob plus -nt replaces parsing "ls" output, and an empty result is
# reported instead of silently using the directory itself as the input file.
FILE=
for candidate in "$MONITOR_DIR"/monitor-anaops-history*.txt; do
  [[ -f $candidate ]] || continue
  if [[ -z $FILE || $candidate -nt $FILE ]]; then
    FILE=$candidate
  fi
done
if [[ -z $FILE ]]; then
  printf 'ERROR: no monitor-anaops-history*.txt file found in %s\n' \
    "$MONITOR_DIR" >&2
  exit 1
fi
echo "HISTORY FILE: $FILE"

# Modification time of the dump in epoch seconds; it is the reference point
# for the 24 hour window of the held-jobs table.
NOW=$(stat -c %Y -- "$FILE")

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Job selectors: each one writes the matching history lines to stdout.
aborted_jobs()   { grep '^JobStatus=3' "$FILE"; }
succeeded_jobs() { grep '^JobStatus=4' "$FILE" | grep 'ExitCode=0 '; }
failed_jobs()    { grep '^JobStatus=4' "$FILE" | grep -v 'ExitCode=0 '; }

# count_lines: print the number of lines read from stdin (0 for no input).
count_lines() { awk 'END { print NR }'; }

# sum_attr ATTR: print the sum of the integer part of every ATTR=<digits>
# occurrence read from stdin (0 when there is none). %.0f is used instead of
# %d so that large sums are not clipped by awk implementations with a
# 32-bit %d.
sum_attr() {
  grep -o "$1=[0-9]*" \
    | awk -F= '{ total += $2 } END { printf "%.0f\n", total }'
}

# ratio NUM DEN SCALE: print NUM / DEN * SCALE, or 0 when DEN is not positive.
ratio() {
  awk -v num="$1" -v den="$2" -v scale="$3" \
    'BEGIN { if (den > 0) printf "%.4f\n", num / den * scale; else print 0 }'
}

# summary_row LABEL NUMBER WC_DAYS WC_PCT WC_PER_JOB_H CPU_EFF_PCT
summary_row() {
  printf '%-11s %10s %10s %10.1f %10.1f %10.1f\n' "$@"
}

# ---------------------------------------------------------------------------
# Schedds
# ---------------------------------------------------------------------------

echo
echo "SCHEDDS CONSIDERED IN THE HISTORY:"
echo
echo " Jobs Schedd"
# GlobalJobId looks like <schedd>#<rest>; keep the schedd part and count it.
grep -o 'GlobalJobId=.*' "$FILE" \
  | awk -F= '{ print $2 }' \
  | awk -F'#' '{ print $1 }' \
  | sort \
  | uniq -c
echo
echo "N.B. Job counts include both finished, queued and running jobs."
echo

# ---------------------------------------------------------------------------
# Summary table
# ---------------------------------------------------------------------------

nabort=$(aborted_jobs | count_lines)
ngood=$(succeeded_jobs | count_lines)
nbad=$(failed_jobs | count_lines)

# All sums are kept in seconds. Only the displayed WC(d) column is truncated
# to whole days; deriving the percentages, the per-job hours and the CPU
# efficiency from already truncated day counts would throw away up to a day
# per category.
abort_wc_s=$(aborted_jobs | sum_attr RemoteWallClockTime)
good_wc_s=$(succeeded_jobs | sum_attr RemoteWallClockTime)
bad_wc_s=$(failed_jobs | sum_attr RemoteWallClockTime)
total_wc_s=$(( abort_wc_s + good_wc_s + bad_wc_s ))

abort_cpu_s=$(aborted_jobs | sum_attr RemoteUserCpu)
good_cpu_s=$(succeeded_jobs | sum_attr RemoteUserCpu)
bad_cpu_s=$(failed_jobs | sum_attr RemoteUserCpu)

abortWC=$(( abort_wc_s / SECONDS_PER_DAY ))
goodWC=$(( good_wc_s / SECONDS_PER_DAY ))
badWC=$(( bad_wc_s / SECONDS_PER_DAY ))

# Share of the total wall clock time, in percent.
abortpct=$(ratio "$abort_wc_s" "$total_wc_s" 100)
goodpct=$(ratio "$good_wc_s" "$total_wc_s" 100)
badpct=$(ratio "$bad_wc_s" "$total_wc_s" 100)

# CPU time over wall clock time, in percent.
aborteff=$(ratio "$abort_cpu_s" "$abort_wc_s" 100)
goodeff=$(ratio "$good_cpu_s" "$good_wc_s" 100)
badeff=$(ratio "$bad_cpu_s" "$bad_wc_s" 100)

# Average wall clock time per job, in hours.
abortperjob=$(ratio "$abort_wc_s" "$(( nabort * 3600 ))" 1)
goodperjob=$(ratio "$good_wc_s" "$(( ngood * 3600 ))" 1)
badperjob=$(ratio "$bad_wc_s" "$(( nbad * 3600 ))" 1)

echo
echo "SUMMARY TABLE OF JOBS WHICH COMPLETED IN THE PAST 24 HOURS (not including Held jobs):"
echo
printf '%-11s %10s %10s %10s %10s %10s\n' \
  "Job Result" "Number" "WC(d)" "WC(%)" "WC/job(h)" "CPU/WC(%)"
summary_row "ExitCode=0"  "$ngood"  "$goodWC"  "$goodpct"  "$goodperjob"  "$goodeff"
summary_row "ExitCode!=0" "$nbad"   "$badWC"   "$badpct"   "$badperjob"   "$badeff"
summary_row "Aborted"     "$nabort" "$abortWC" "$abortpct" "$abortperjob" "$aborteff"

# ---------------------------------------------------------------------------
# Exit code breakdown
# ---------------------------------------------------------------------------

echo
echo "EXIT CODE BREAKDOWN OF COMPLETED JOBS:"
echo
printf "%8s %9s %9s %-11s\n" "Jobs" "Condor" "CMSSW" "Explanation"
printf "%18s %9s\n" "ExitCode" "ExitCode"
# Only exit codes seen on more than 100 jobs are listed, most frequent first.
# After tr the fields are: <count> ExitCode <code>.
grep '^JobStatus=4' "$FILE" \
  | grep -o 'ExitCode=[0-9]*' \
  | sort \
  | uniq -c \
  | awk '$1 > 100' \
  | tr '=' ' ' \
  | sort -n -r -k 1 \
  | awk '
# Register the CMSSW code and the explanation for one Condor exit code.
function add(condor, cmssw, text) {
  cmssw_code[condor] = cmssw
  meaning[condor] = text
}
BEGIN {
  add(0,   0,     "Success")
  add(59,  10043, "Unable to bootstrap WMCore libraries (most likely site python is broken)")
  add(65,  8001,  "Other CMS Exception, or 65: End of job from user application (CMSSW)")
  add(83,  8019,  "FileInPathError")
  add(84,  8020,  "FileOpenError (Likely a site error), or 84: Some required file not found")
  add(85,  8021,  "FileReadError (May be a site error)")
  add(92,  8028,  "FileOpenError with fallback")
  add(112, 70000, "Output_sandbox too big for WMS, or 50800: Application segfaulted")
  add(127, 127,   "Error while loading shared library")
  add(142, 60302, "Output file(s) not found")
  add(147, 60307, "Failed to copy an output file to the SE")
  add(148, 60308, "An output file was saved to fall back local SE after failing to copy")
  add(157, 60317, "Forced timeout for stuck stage out")
  add(158, 60318, "Internal error in Crab cmscp.py stageout script")
  add(195, 50115, "cmsRun did not produce a valid job report at runtime (often means cmsRun segfaulted)")
  add(228, 50660, "Application terminated by wrapper because using too much RAM (RSS)")
  add(232, 50664, "Application terminated by wrapper because using too much Wall Clock time")
  add(237, 50669, "Application terminated by wrapper for not defined reason")
}
{
  # Force a numeric key so the lookup behaves like a numeric comparison.
  code = $3 + 0
  if (code in meaning) {
    id = cmssw_code[code]
    explanation = meaning[code]
  } else {
    id = 0
    explanation = "not listed yet"
  }
  printf("%8i %9i %9i %-24s\n", $1, $3, id, explanation)
}'
echo
echo "N.B. Exit Code explanations copied from https://twiki.cern.ch/twiki/bin/viewauth/CMS/JobExitCodes"

# ---------------------------------------------------------------------------
# Held jobs
# ---------------------------------------------------------------------------

echo
echo "HELD JOBS IN THE PAST 24 HOURS:"
echo
printf "%-20s %8s %8s %8s %10s\n" "Site" "Held Jobs" "Users" "Pilots" "WC(d)"

# One row per site (names starting with T) for jobs that entered the held
# state less than a day before the dump was written, plus a TOTAL row.
grep '^JobStatus=5' "$FILE" | awk -v now="$NOW" '
BEGIN {
  yesterday = now - 86400
}
{
  # Defaults for attributes that are missing from this job.
  site = "unknown"
  owner = "unknown"
  pilot = "unknown"
  wallclock = 0
  recent = 0

  for (i = 1; i <= NF; i++) {
    split($i, kv, "=")
    if (kv[1] == "EnteredCurrentStatus") {
      if (kv[2] > yesterday) { recent = 1 }
    }
    if (kv[1] == "MATCH_GLIDEIN_CMSSite") { site = kv[2] }
    if (kv[1] == "Owner")                 { owner = kv[2] }
    if (kv[1] == "LastRemoteHost")        { pilot = kv[2] }
    if (kv[1] == "RemoteWallClockTime")   { wallclock = kv[2] }
  }

  if (recent && site ~ /^T/) {
    held_jobs[site] += 1
    held_wc[site] += wallclock
    # Count each distinct owner and pilot host once per site.
    if (!((site, owner) in seen_owner)) {
      seen_owner[site, owner] = 1
      n_owners[site] += 1
    }
    if (!((site, pilot) in seen_pilot)) {
      seen_pilot[site, pilot] = 1
      n_pilots[site] += 1
    }
  }
}
END {
  sum_jobs = 0
  sum_wc_days = 0
  for (site in held_jobs) {
    wc_days = held_wc[site] / 86400.
    sum_jobs += held_jobs[site]
    sum_wc_days += wc_days
    printf "%-20s %8i %8i %8i %10.1f\n", site, held_jobs[site], n_owners[site], n_pilots[site], wc_days
  }
  # Same column layout as the site rows so the totals line up beneath them.
  printf "%-20s %8i %8s %8s %10.1f\n", "TOTAL", sum_jobs, "", "", sum_wc_days
}
' | grep '^T' | sort

# ---------------------------------------------------------------------------
# User priorities
# ---------------------------------------------------------------------------

echo
echo
echo "USER PRIORITIES:"
echo
if [[ -n $POOLNAME ]]; then
  condor_userprio -allusers -all -pool "$POOLNAME"
else
  # No pool given: leave out -pool rather than passing it without a value.
  condor_userprio -allusers -all
fi

# Exit with the status of condor_userprio.
exit