Skip to content

Commit

Permalink
Cleaning up.
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesletts committed Apr 30, 2014
1 parent 95954b3 commit 0010ca2
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 25 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ JAMES LETTS, April 27, 2014.

Monitoring scripts for glideinWMS pools in CMS.

Setup file:

* bashrc Customise the location of the condor source file and grid proxy.

Include files are:

* condor_functions.sh: General queries for schedds and the collector.
Expand Down
5 changes: 4 additions & 1 deletion bashrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ export X509_USER_PROXY=/tmp/x509up_u500
# source HTCondor commands
source /etc/profile.d/condor.sh

# place for the output files
export glideinWMSMonitor_OUTPUT_DIR="/crabprod/CSstoragePath/Monitor"

#################### MAKE CHANGES ABOVE THIS LINE ####################

# Discover the directory where the software sits:
export glideinWMSMonitor_RELEASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export glideinWMSMonitor_RELEASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# source the functions to discover SiteDB, HTCondor and CMS Dashboard information
source $glideinWMSMonitor_RELEASE_DIR/sitedb_functions.sh
Expand Down
2 changes: 1 addition & 1 deletion condor_check
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

if [ -z $glideinWMSMonitor_RELEASE_DIR ] ; then
echo "ERROR: source code missing."
echo "ERROR: glideinWMSMonitor source code missing."
exit 1
else
source $glideinWMSMonitor_RELEASE_DIR/bashrc
Expand Down
Empty file modified condor_check.sh
100644 → 100755
Empty file.
Empty file modified condor_functions.sh
100644 → 100755
Empty file.
104 changes: 82 additions & 22 deletions condor_history_analyze.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,34 @@
# DAG jobs not included in held table.
# report CRAB3 vs CRAB2 jobs

source /home/letts/scripts/condor_functions.sh
POOLNAME=$1

# Refuse to run unless glideinWMSMonitor_RELEASE_DIR is set and non-empty.
# The expansion is quoted so that a value containing whitespace cannot turn
# the test into a "too many arguments" error and slip past the guard.
if [ -z "$glideinWMSMonitor_RELEASE_DIR" ] ; then
  echo "ERROR: glideinWMSMonitor source code missing."
  exit 1
fi

# get the latest dumped history file from the web server:
FILE=/crabprod/CSstoragePath/Monitor/`ls -1rt /crabprod/CSstoragePath/Monitor \
FILE=$glideinWMSMonitor_OUTPUT_DIR/`ls -1rt /crabprod/CSstoragePath/Monitor \
| grep ^monitor-anaops-history | grep \.txt$ | tail -1`
echo HISTORY FILE: $FILE
NOW=`ls -l --time-style=+%s $FILE | awk '{print $6}'`

# Print one "count name" line per schedd for the history records whose
# first attribute matches the given JobStatus pattern.
#   $1 - grep pattern selecting the records (e.g. '^JobStatus=[34]')
# Reads the history dump named by $FILE. The schedd name is the part of the
# GlobalJobId value that precedes the first '#'.
count_jobs_per_schedd() {
  # Both the file name and the GlobalJobId pattern are quoted: the former may
  # contain whitespace, the latter would otherwise be open to glob expansion.
  grep "$1" "$FILE" \
    | grep -o 'GlobalJobId=.*' \
    | awk -F= '{print $2}' \
    | awk -F'#' '{print $1}' \
    | sort \
    | uniq -c
}

# Report header followed by the per-schedd counts of records with
# JobStatus 1, 2 or 5 (labelled "Queued" in the report).
cat <<EOF
HISTORY FILE: $FILE
SCHEDDS CONSIDERED IN THE HISTORY:
Queued
Jobs Schedd Name
EOF
count_jobs_per_schedd '^JobStatus=[125]'

# Per-schedd counts of records with JobStatus 3 or 4 (labelled "Done").
cat <<EOF
Done
Jobs Schedd Name
EOF
count_jobs_per_schedd '^JobStatus=[34]'
echo
echo "SCHEDDS CONSIDERED IN THE HISTORY:"
/bin/date -u
/bin/date
echo
echo " Jobs Schedd"
cat $FILE | grep -o GlobalJobId=.* | awk -F\= '{print $2}' | awk -F\# '{print $1}' | sort | uniq -c
echo
echo "N.B. Job counts include both finished, queued and running jobs."
echo

nabort=`grep "^JobStatus=3" $FILE | wc -l`
Expand All @@ -34,33 +44,78 @@ goodWC=` grep "^JobStatus=4" $FILE | grep 'ExitCode=0\ ' | grep -o RemoteW
badWC=` grep "^JobStatus=4" $FILE | grep -v 'ExitCode=0\ ' | grep -o RemoteWallClockTime=[0-9]* \
| awk -F\= 'BEGIN{x=0}{x+=$2}END{print int(x/86400.)}'`

totalWC=$[$abortWC+$goodWC+$badWC]
abortpct=`echo $abortWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'`
goodpct=` echo $goodWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'`
badpct=` echo $badWC $totalWC | awk '{if($2>0){print $1/$2*100.0}else{print 0}}'`

abortCPU=`grep "^JobStatus=3" $FILE | grep -o RemoteUserCpu=[0-9]* \
| awk -F\= 'BEGIN{x=0}{x+=$2}END{print int(x/86400.)}'`
goodCPU=` grep "^JobStatus=4" $FILE | grep 'ExitCode=0\ ' | grep -o RemoteUserCpu=[0-9]* \
| awk -F\= 'BEGIN{x=0}{x+=$2}END{print int(x/86400.)}'`
badCPU=` grep "^JobStatus=4" $FILE | grep -v 'ExitCode=0\ ' | grep -o RemoteUserCpu=[0-9]* \
| awk -F\= 'BEGIN{x=0}{x+=$2}END{print int(x/86400.)}'`

# need to REMOVE DAG JOBS FROM HERE!!
# Summarise the held jobs (JobStatus=5) in $FILE that entered their current
# status during the 86400 seconds before $NOW.
# Sets:
#   nheld   - number of such job records
#   heldWC  - their summed RemoteWallClockTime, converted to days
#   heldCPU - their summed RemoteUserCpu, converted to days
# The three values are printed on ONE line so that `read` receives all of
# them whether or not the shell word-splits the here-string. With no matching
# job all three are 0; the earlier per-job and efficiency divisions, whose
# results were never printed, aborted awk with a division by zero in that case.
read -r nheld heldWC heldCPU <<< "$(grep "^JobStatus=5" "$FILE" | awk -v now="$NOW" '
BEGIN {
  yesterday = now - 86400
  SumRemoteWallClockTime = 0
  SumRemoteUserCpu = 0
  SumHeld = 0
}
{
  # Gather the attributes of this record first and decide afterwards, so the
  # result does not depend on where EnteredCurrentStatus sits on the line.
  recent = 0
  wallclock = 0
  usercpu = 0
  for (i = 1; i <= NF; i++) {
    split($i, subfields, "=")
    if (subfields[1] == "EnteredCurrentStatus") {
      if (subfields[2] > yesterday) { recent = 1 }
    } else if (subfields[1] == "RemoteWallClockTime") {
      wallclock += subfields[2]
    } else if (subfields[1] == "RemoteUserCpu") {
      usercpu += subfields[2]
    }
  }
  if (recent) {
    SumHeld += 1
    SumRemoteWallClockTime += wallclock
    SumRemoteUserCpu += usercpu
  }
}
END {
  # seconds -> days
  print SumHeld, SumRemoteWallClockTime / 86400., SumRemoteUserCpu / 86400.
}
')"

# Grand totals over the four job categories (removed, ExitCode=0,
# ExitCode!=0, held): job count, wall-clock days and CPU days.
# Exactly four values are passed to awk, so the held figure is field 4;
# summing field 5 silently left the held jobs out of every total.
# A category that is unset or empty counts as 0.
ntotal=$(echo "${nabort:-0}" "${ngood:-0}" "${nbad:-0}" "${nheld:-0}" \
  | awk '{print $1+$2+$3+$4}')
totalWC=$(echo "${abortWC:-0}" "${goodWC:-0}" "${badWC:-0}" "${heldWC:-0}" \
  | awk '{print $1+$2+$3+$4}')
totalCPU=$(echo "${abortCPU:-0}" "${goodCPU:-0}" "${badCPU:-0}" "${heldCPU:-0}" \
  | awk '{print $1+$2+$3+$4}')

# Print numerator/denominator multiplied by a scale factor, or 0 when the
# denominator is not greater than zero.
#   $1 - scale factor (100 for a percentage, 24 for days -> hours)
#   $2 - numerator
#   $3 - denominator
# The operands are deliberately left unquoted: they are handed to awk as
# whitespace-separated fields, exactly as a plain `echo a b | awk` would.
scaled_ratio() {
  echo $2 $3 | awk -v scale="$1" '{ if ($2 > 0) { print $1 / $2 * scale } else { print 0 } }'
}

# Share of the total wall-clock time taken by each job category, in percent.
abortpct=$(scaled_ratio 100 $abortWC $totalWC)
goodpct=$(scaled_ratio 100 $goodWC $totalWC)
badpct=$(scaled_ratio 100 $badWC $totalWC)
heldpct=$(scaled_ratio 100 $heldWC $totalWC)

# CPU time as a percentage of wall-clock time, per category and overall.
aborteff=$(scaled_ratio 100 $abortCPU $abortWC)
goodeff=$(scaled_ratio 100 $goodCPU $goodWC)
badeff=$(scaled_ratio 100 $badCPU $badWC)
heldeff=$(scaled_ratio 100 $heldCPU $heldWC)
totaleff=$(scaled_ratio 100 $totalCPU $totalWC)

# Wall-clock hours per job (wall-clock days / job count * 24).
abortperjob=$(scaled_ratio 24 $abortWC $nabort)
goodperjob=$(scaled_ratio 24 $goodWC $ngood)
badperjob=$(scaled_ratio 24 $badWC $nbad)
heldperjob=$(scaled_ratio 24 $heldWC $nheld)
totalperjob=$(scaled_ratio 24 $totalWC $ntotal)

echo
echo "SUMMARY TABLE OF JOBS WHICH COMPLETED IN THE PAST 24 HOURS (not including Held jobs):"
echo "SUMMARY TABLE OF JOBS WHICH COMPLETED IN THE PAST 24 HOURS:"
echo
printf "Job Result %10s %10s %10s %10s %10s\n" "Number" "WC(d)" "WC(%)" "WC/job(h)" "CPU/WC(%)"
printf "ExitCode=0 %10s %10s %10.1f %10.1f %10.1f\n" $ngood $goodWC $goodpct $goodperjob $goodeff
printf "ExitCode!=0 %10s %10s %10.1f %10.1f %10.1f\n" $nbad $badWC $badpct $badperjob $badeff
printf "Aborted %10s %10s %10.1f %10.1f %10.1f\n" $nabort $abortWC $abortpct $abortperjob $aborteff
printf "Job Result %10s %10s %10s %10s %10s\n" "Number" "WC(d)" "WC(%)" "WC/job(h)" "CPU/WC(%)"
printf "ExitCode=0 %10.0f %10.0f %10.1f %10.1f %10.1f\n" $ngood $goodWC $goodpct $goodperjob $goodeff
printf "ExitCode!=0 %10.0f %10.0f %10.1f %10.1f %10.1f\n" $nbad $badWC $badpct $badperjob $badeff
printf "Removed %10.0f %10.0f %10.1f %10.1f %10.1f\n" $nabort $abortWC $abortpct $abortperjob $aborteff
printf "Held %10.0f %10.0f %10.1f %10.1f %10.1f\n" $nheld $heldWC $heldpct $heldperjob $heldeff
echo
printf "Sum %10.0f %10.0f %10.1f %10.1f %10.1f\n" $ntotal $totalWC "100" $totalperjob $totaleff


echo
echo EXIT CODE BREAKDOWN OF COMPLETED JOBS:
Expand Down Expand Up @@ -101,6 +156,8 @@ echo HELD JOBS IN THE PAST 24 HOURS:
echo
printf "%-20s %8s %8s %8s %10s\n" "Site" "Held Jobs" "Users" "Pilots" "WC(d)"



grep "^JobStatus=5" $FILE | awk -v now=$NOW ' \
{
MATCH_GLIDEIN_CMSSite=unknown
Expand Down Expand Up @@ -168,10 +225,13 @@ END {
}
' | grep ^T | sort

exit

echo
echo
echo USER PRIORITIES:
echo
condor_userprio -allusers -all -pool $POOLNAME
#condor_userprio -allusers -all -pool $POOLNAME
condor_userprio -all -pool $POOLNAME

exit
2 changes: 1 addition & 1 deletion condor_history_dump.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/bin/sh
source /home/letts/scripts/condor_functions.sh
POOLNAME="glidein-collector-2.t2.ucsd.edu"

OUTFILE=/crabprod/CSstoragePath/Monitor/monitor-anaops-history-`/bin/date +%F-Z%R -u`.txt
Expand All @@ -9,4 +8,5 @@ mv ${OUTFILE}.tmp $OUTFILE
OUTFILE=/crabprod/CSstoragePath/Monitor/latest-history.txt
/home/letts/scripts/condor_history_analyze.sh $POOLNAME > ${OUTFILE}.tmp
mv ${OUTFILE}.tmp $OUTFILE

exit
Empty file modified dashboard_functions.sh
100644 → 100755
Empty file.
Empty file modified sitedb_functions.sh
100644 → 100755
Empty file.

0 comments on commit 0010ca2

Please sign in to comment.