From 0010ca2f605add302c3096da4e4ebcc77258f8ad Mon Sep 17 00:00:00 2001 From: JAMES LETTS Date: Tue, 29 Apr 2014 20:28:32 -0700 Subject: [PATCH] Cleaning up. --- README.md | 4 ++ bashrc | 5 +- condor_check | 2 +- condor_check.sh | 0 condor_functions.sh | 0 condor_history_analyze.sh | 104 ++++++++++++++++++++++++++++++-------- condor_history_dump.sh | 2 +- dashboard_functions.sh | 0 sitedb_functions.sh | 0 9 files changed, 92 insertions(+), 25 deletions(-) mode change 100644 => 100755 condor_check.sh mode change 100644 => 100755 condor_functions.sh mode change 100644 => 100755 condor_history_analyze.sh mode change 100644 => 100755 condor_history_dump.sh mode change 100644 => 100755 dashboard_functions.sh mode change 100644 => 100755 sitedb_functions.sh diff --git a/README.md b/README.md index 0d31b45..c24a01d 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ JAMES LETTS, April 27, 2014. Monitoring scripts for glideinWMS pools in CMS. +Setup file: + + * bashrc Customise the location of the condor source file and grid proxy. + Include files are: * condor_functions.sh: General queries for schedds and the collector. diff --git a/bashrc b/bashrc index c135bd6..c9fffc9 100644 --- a/bashrc +++ b/bashrc @@ -6,10 +6,13 @@ export X509_USER_PROXY=/tmp/x509up_u500 # source HTCondor commands source /etc/profile.d/condor.sh +# place for the output files +export glideinWMSMonitor_OUTPUT_DIR="/crabprod/CSstoragePath/Monitor" + #################### MACE CHANGES ABOVE THIS LINE #################### # Discover the directory where the software sits: -export glideinWMSMonitor_RELEASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +export glideinWMSMonitor_RELEASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # source the functions to discover SiteDB, HTCondor and CMS Dashboard information source $glideinWMSMonitor_RELEASE_DIR/sitedb_functions.sh diff --git a/condor_check b/condor_check index 0ffdeec..416cab9 100644 --- a/condor_check +++ b/condor_check @@ -1,7 +1,7 @@ #!/bin/bash if [ -z $glideinWMSMonitor_RELEASE_DIR ] ; then - echo "ERROR: source code missing." + echo "ERROR: glideinWMSMonitor source code missing." exit 1 else source $glideinWMSMonitor_RELEASE_DIR/bashrc diff --git a/condor_check.sh b/condor_check.sh old mode 100644 new mode 100755 diff --git a/condor_functions.sh b/condor_functions.sh old mode 100644 new mode 100755 diff --git a/condor_history_analyze.sh b/condor_history_analyze.sh old mode 100644 new mode 100755 index a18486b..1e0572a --- a/condor_history_analyze.sh +++ b/condor_history_analyze.sh @@ -3,24 +3,34 @@ # DAG jobs not included in held table. # report CRAB3 vs CRAB2 jobs -source /home/letts/scripts/condor_functions.sh POOLNAME=$1 +if [ -z $glideinWMSMonitor_RELEASE_DIR ] ; then + echo "ERROR: glideinWMSMonitor source code missing." + exit 1 +fi + # get the latest dumped history file from the web server: -FILE=/crabprod/CSstoragePath/Monitor/`ls -1rt /crabprod/CSstoragePath/Monitor \ +FILE=$glideinWMSMonitor_OUTPUT_DIR/`ls -1rt /crabprod/CSstoragePath/Monitor \ | grep ^monitor-anaops-history | grep \.txt$ | tail -1` -echo HISTORY FILE: $FILE NOW=`ls -l --time-style=+%s $FILE | awk '{print $6}'` +cat <0){print $1/$2*100.0}else{print 0}}'` -goodpct=` echo $goodWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` -badpct=` echo $badWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` - abortCPU=`grep "^JobStatus=3" $FILE | grep -o RemoteUserCpu=[0-9]* \ | awk -F\= 'BEGIN{x=0}{x+=$2}END{print int(x/86400.)}'` goodCPU=` grep "^JobStatus=4" $FILE | grep 'ExitCode=0\ ' | grep -o RemoteUserCpu=[0-9]* \ @@ -46,21 +51,71 @@ goodCPU=` grep "^JobStatus=4" $FILE | grep 'ExitCode=0\ ' | grep -o RemoteU badCPU=` grep "^JobStatus=4" $FILE | grep -v 'ExitCode=0\ ' | grep -o RemoteUserCpu=[0-9]* \ | awk -F\= 'BEGIN{x=0}{x+=$2}END{print int(x/86400.)}'` +# need to REMOVE DAG JOBS FROM HERE!! +read nheld heldWC heldCPU <<< $(grep "^JobStatus=5" $FILE | awk -v now=$NOW ' \ +BEGIN { + yesterday=now-86400 + SumRemoteWallClockTime=0 + SumRemoteUserCpu=0 + SumHeld=0 +} +{ + skip=1 + for (i=1; i<=NF; i++) { + split($i,subfields,"=") + if (subfields[1]=="EnteredCurrentStatus") { + EnteredCurrentStatus=subfields[2] + if (EnteredCurrentStatus>yesterday) { skip=0 } + } + if (skip==0 && subfields[1]=="RemoteWallClockTime") { SumRemoteWallClockTime+=subfields[2] } + if (skip==0 && subfields[1]=="RemoteUserCpu") { SumRemoteUserCpu+=subfields[2] } + } + if ( skip==0 ) { SumHeld+=1 } +} +END{ + SumRemoteWallClockTime/=86400. + SumRemoteUserCpu/=86400. + HeldPerJob=SumRemoteWallClockTime/SumHeld*24. + HeldEff=SumRemoteUserCpu/SumRemoteWallClockTime*100. + #printf "Held %10i %10.1f %10.1f %10.1f\n",SumHeld,SumRemoteWallClockTime,HeldPerJob,HeldEff + print SumHeld + print SumRemoteWallClockTime + print SumRemoteUserCpu +} +') + +ntotal=` echo $nabort $ngood $nbad $nheld | awk '{print $1+$2+$3+$5}'` +totalWC=` echo $abortWC $goodWC $badWC $heldWC | awk '{print $1+$2+$3+$5}'` +totalCPU=`echo $abortCPU $goodCPU $badCPU $heldCPU | awk '{print $1+$2+$3+$5}'` + +abortpct=`echo $abortWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` +goodpct=` echo $goodWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` +badpct=` echo $badWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` +heldpct=` echo $heldWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` + aborteff=` echo $abortCPU $abortWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` goodeff=` echo $goodCPU $goodWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` badeff=` echo $badCPU $badWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` +heldeff=` echo $heldCPU $heldWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` +totaleff=` echo $totalCPU $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'` abortperjob=`echo $abortWC $nabort | awk '{if($2>0){print $1/$2*24.}else{print 0}}'` goodperjob=` echo $goodWC $ngood | awk '{if($2>0){print $1/$2*24.}else{print 0}}'` badperjob=` echo $badWC $nbad | awk '{if($2>0){print $1/$2*24.}else{print 0}}'` +heldperjob=` echo $heldWC $nheld | awk '{if($2>0){print $1/$2*24.}else{print 0}}'` +totalperjob=`echo $totalWC $ntotal | awk '{if($2>0){print $1/$2*24.}else{print 0}}'` echo -echo "SUMMARY TABLE OF JOBS WHICH COMPLETED IN THE PAST 24 HOURS (not including Held jobs):" +echo "SUMMARY TABLE OF JOBS WHICH COMPLETED IN THE PAST 24 HOURS:" echo -printf "Job Result %10s %10s %10s %10s %10s\n" "Number" "WC(d)" "WC(%)" "WC/job(h)" "CPU/WC(%)" -printf "ExitCode=0 %10s %10s %10.1f %10.1f %10.1f\n" $ngood $goodWC $goodpct $goodperjob $goodeff -printf "ExitCode!=0 %10s %10s %10.1f %10.1f %10.1f\n" $nbad $badWC $badpct $badperjob $badeff -printf "Aborted %10s %10s %10.1f %10.1f %10.1f\n" $nabort $abortWC $abortpct $abortperjob $aborteff +printf "Job Result %10s %10s %10s %10s %10s\n" "Number" "WC(d)" "WC(%)" "WC/job(h)" "CPU/WC(%)" +printf "ExitCode=0 %10.0f %10.0f %10.1f %10.1f %10.1f\n" $ngood $goodWC $goodpct $goodperjob $goodeff +printf "ExitCode!=0 %10.0f %10.0f %10.1f %10.1f %10.1f\n" $nbad $badWC $badpct $badperjob $badeff +printf "Removed %10.0f %10.0f %10.1f %10.1f %10.1f\n" $nabort $abortWC $abortpct $abortperjob $aborteff +printf "Held %10.0f %10.0f %10.1f %10.1f %10.1f\n" $nheld $heldWC $heldpct $heldperjob $heldeff +echo +printf "Sum %10.0f %10.0f %10.1f %10.1f %10.1f\n" $ntotal $totalWC "100" $totalperjob $totaleff + echo echo EXIT CODE BREAKDOWN OF COMPLETED JOBS: @@ -101,6 +156,8 @@ echo HELD JOBS IN THE PAST 24 HOURS: echo printf "%-20s %8s %8s %8s %10s\n" "Site" "Held Jobs" "Users" "Pilots" "WC(d)" + + grep "^JobStatus=5" $FILE | awk -v now=$NOW ' \ { MATCH_GLIDEIN_CMSSite=unknown @@ -168,10 +225,13 @@ END { } ' | grep ^T | sort +exit + echo echo echo USER PRIORITIES: echo -condor_userprio -allusers -all -pool $POOLNAME +#condor_userprio -allusers -all -pool $POOLNAME +condor_userprio -all -pool $POOLNAME exit diff --git a/condor_history_dump.sh b/condor_history_dump.sh old mode 100644 new mode 100755 index c355493..2e85b1d --- a/condor_history_dump.sh +++ b/condor_history_dump.sh @@ -1,5 +1,4 @@ #!/bin/sh -source /home/letts/scripts/condor_functions.sh POOLNAME="glidein-collector-2.t2.ucsd.edu" OUTFILE=/crabprod/CSstoragePath/Monitor/monitor-anaops-history-`/bin/date +%F-Z%R -u`.txt @@ -9,4 +8,5 @@ mv ${OUTFILE}.tmp $OUTFILE OUTFILE=/crabprod/CSstoragePath/Monitor/latest-history.txt /home/letts/scripts/condor_history_analyze.sh $POOLNAME > ${OUTFILE}.tmp mv ${OUTFILE}.tmp $OUTFILE + exit diff --git a/dashboard_functions.sh b/dashboard_functions.sh old mode 100644 new mode 100755 diff --git a/sitedb_functions.sh b/sitedb_functions.sh old mode 100644 new mode 100755