Commit

Updated.
jamesletts committed May 21, 2014
1 parent fd1d652 commit 9fa3435
Showing 4 changed files with 166 additions and 25 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -22,3 +22,8 @@ Scripts:

* condor_history_dump.sh: Dumps schedd ClassAds for recently run jobs and the current queue.
* condor_history_analyze.sh: Analyzes the dump of the schedd ClassAds.


To do:

* Switch to using SiteDB API when possible? https://github.com/dmwm/WMCore/blob/master/src/python/WMCore/Services/SiteDB/SiteDB.py
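For reference, a rough sketch of what a direct SiteDB v2 query could look like (endpoint and proxy-based auth are assumptions; the WMCore client above would be the supported route):

  # List CMS site-name mappings from SiteDB, authenticating with a grid proxy
  # (assumed endpoint from SiteDB v2 on cmsweb):
  curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY \
      "https://cmsweb.cern.ch/sitedb/data/prod/site-names"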
53 changes: 32 additions & 21 deletions condor_check
@@ -7,48 +7,60 @@ else
source $glideinWMSMonitor_RELEASE_DIR/bashrc
fi

POOLNAME=$1
SHORT=$2
COLLECTOR1=$1
COLLECTOR2=$2
# COLLECTOR2 could in principle be discovered automatically, but that does not work at CERN.
SHORT=$3
if [ "X"$SHORT == "X" ] ; then SHORT="long" ; fi

# Header
echo Summary Table for glideinWMS pool $POOLNAME at `/bin/date -u`
echo Summary Table for glideinWMS pool $COLLECTOR1 at `/bin/date -u`
echo

# Summary Table from the Collector
condor_status -pool $POOLNAME -schedd || exit 2
condor_status -pool $COLLECTOR1 -schedd || exit 2
echo

# Last Negotiation Cycle time
negotime=`condor_status -pool $POOLNAME -nego -l | grep LastNegotiationCycleDuration0 | awk '{print $3}'` || exit 3
echo "Negotiation time = ${negotime}s"
negotime=`condor_status -pool $COLLECTOR1 -nego -l | grep LastNegotiationCycleDuration0 | awk '{print $3}'` || exit 3
echo "Negotiation time (ideally under 300s) = ${negotime}s"
echo
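The 300s guideline above could also be enforced with an explicit warning; a minimal sketch (an editor's assumption, not part of the original script):

# Warn if the negotiation cycle exceeds the 300s guideline.
# ${negotime%.*} drops any fractional part for the integer test;
# the error from an empty value is suppressed.
if [ "${negotime%.*}" -gt 300 ] 2>/dev/null ; then
echo "WARNING: negotiation cycle exceeded 300s"
fi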

# don't print the long table if short output was requested
if [ $SHORT == "short" ] ; then
exit 0
fi

total1=`condor_status -schedd -total -pool $COLLECTOR1 | tail -1 | awk '{print int($2)}'`
total2=`condor_status -schedd -total -pool $COLLECTOR2 | tail -1 | awk '{print int($2)}'`
colldiff=`echo $total1 $total2 | awk '{print int($1-$2)}'`
echo "Difference between collector total job counts: $total1 ($COLLECTOR1) - $total2 ($COLLECTOR2) = $colldiff"
echo


# get information from sitedb about pledges and se names by CMSSite name
PLEDGES=`get_pledges_from_sitedb`
SEDFILE=`translate_se_names_in_sitedb_to_cmssite`

# get information from dashboard about avg and max usage by CMSSite name
USAGE=`dashboard_usage_by_site analysis "3 months ago"`
MAXUSAGE=`dashboard_usage_by_site all "3 months ago"`

# Get information from dashboard about avg and max usage by CMSSite name
# Information from T2_US_Wisconsin before April 3, 2014 is inflated.
USAGEUW=`dashboard_usage_by_site analysis "3 weeks ago"`
MAXUSAGEUW=`dashboard_usage_by_site all "3 weeks ago"`
#USAGE=`dashboard_usage_by_site analysis "3 months ago"`
#MAXUSAGE=`dashboard_usage_by_site all "3 months ago"`
#USAGEUW=`dashboard_usage_by_site analysis "3 weeks ago"`
#MAXUSAGEUW=`dashboard_usage_by_site all "3 weeks ago"`
USAGE=`dashboard_usage_by_site analysis "1 month ago"`
MAXUSAGE=`dashboard_usage_by_site all "1 month ago"`
USAGEUW=$USAGE
MAXUSAGEUW=$MAXUSAGE

# get information from SSB about site downtimes
DOWNTIMES=`site_downtimes_from_ssb`

# get claimed and running pilots, DESIRED_Sites for each pool:
CLAIMED=`get_pilots_by_site $POOLNAME -claimed` || exit 4
RUNNING=`get_pilots_by_site $POOLNAME` || exit 5
DESIRED=`get_DESIRED_Sites $POOLNAME` || exit 6
RUNNINGPROD=`get_pilots_by_site "vocms97.cern.ch"` || exit 7
CLAIMED=`get_pilots_by_site $COLLECTOR1 -claimed` || exit 4
RUNNING=`get_pilots_by_site $COLLECTOR1` || exit 5
DESIRED=`get_DESIRED_Sites $COLLECTOR1` || exit 6
RUNNINGPROD=`get_pilots_by_site "vocms97.cern.ch"` || exit 7

# Print the table of pilots and pressure from queued jobs for each site
printf "%-20s%10s%10s%10s%10s%10s%10s%10s%10s%10s %-18s\n" "Site" "Pledge" "Pledged" "Average" "Maximum" "Claimed" "Unclaimed" "Pressure" "Exclusive" "Running" "Maintenance"
@@ -151,11 +163,10 @@ cat << EOF
Notes:
* Pledges are 50% of the last pledge entered in SiteDB for the site. Tier-1 pledges are set to zero for
analysis, even though analysis jobs can run at the Tier-1 sites.
* Usage statistics are from the last three months in Dashboard for analysis only.
* Usage numbers from Wisconsin in March are overinflated in the Dashboard before April 2014 so only the
past two weeks is used.
* Average Usage statistics are from the last month in Dashboard for activity=analysis only.
* Maximum Usage statistics are from the last month in Dashboard for all activities including production.
* The Site Table does not include DAG jobs (from CRAB3) which do not run at a DESIRED_Site, but rather
on the schedd.
* Sites are only listed in the Site Table if there is demand (running or queued) or pledged resources.
EOF

16 changes: 12 additions & 4 deletions condor_check.sh
@@ -20,7 +20,9 @@ cat >> $OUTFILE <<EOF
EOF
# run analysis of analysis ops pool, with a time limit of 400s.
alarm 300 $glideinWMSMonitor_RELEASE_DIR/condor_check glidein-collector-2.t2.ucsd.edu >> $OUTFILE
COLLECTOR1=glidein-collector-2.t2.ucsd.edu
COLLECTOR2=glidein-collector.t2.ucsd.edu
alarm 400 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 >> $OUTFILE
rc=$?

cat >> $OUTFILE <<EOF
@@ -30,21 +32,27 @@ cat >> $OUTFILE <<EOF
EOF
# run analysis of global pool, with a time limit of 100s.
alarm 300 $glideinWMSMonitor_RELEASE_DIR/condor_check vocms097.cern.ch short >> $OUTFILE
COLLECTOR1=vocms097.cern.ch
COLLECTOR2=vocms099.cern.ch
alarm 100 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 short >> $OUTFILE

cat >> $OUTFILE <<EOF
===================================================== PRODUCTION POOL =====================================================
EOF
alarm 300 $glideinWMSMonitor_RELEASE_DIR/condor_check vocms97.cern.ch short >> $OUTFILE
COLLECTOR1=vocms97.cern.ch
COLLECTOR2=unknown
alarm 100 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 short >> $OUTFILE

# if everything ran correctly, then update the latest file:
if [ $rc -eq 0 ] ; then
LINKNAME=$glideinWMSMonitor_OUTPUT_DIR/latest.txt
rm $LINKNAME
ln -s $OUTFILE $LINKNAME
fi
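Side note: the rm/ln pair above briefly leaves no latest.txt between the two commands. A one-command alternative (a sketch; behavior assumed equivalent for this use) would be:

# replace the symlink in a single command (-f force, -n no-dereference):
ln -sfn $OUTFILE $LINKNAME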

# make a nice html page as a test:
$glideinWMSMonitor_RELEASE_DIR/make_html_page.sh

exit 0
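The alarm helper used above is provided elsewhere in the release; a minimal sketch of the assumed semantics (run a command under a time limit in seconds) in terms of GNU coreutils timeout:

alarm () {
# run a command, killing it if it runs longer than the limit in seconds;
# the command's exit code is passed through (124 on timeout)
local limit=$1
shift
timeout "$limit" "$@"
}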
117 changes: 117 additions & 0 deletions dashboard_functions.sh
@@ -72,3 +72,120 @@ dashboard_usage_by_site() {
echo "$OUTPUTFILE"
return 0
}

dashboard_exit_status() {
# print the exit status of jobs from the dashboard by site
#
# Usage:
# dashboard_exit_status begin_date end_date activity
# where dates are in the form YYYY-mm-dd and activity=[analysis|production|all|analysistest]
#
# Input (csv from the dashboard, one line per site):
#   app-unknown,app-successful,app-failed,site-failed,cancelled,aborted,completed,site
# Output:
#   csv summary: nsites,completed,app-successful,success-rate(%)

date1=$1
date2=$2
activity=$3
url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumbers_terminatedcsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${activity}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=gstb"
curl -ks $url | dos2unix | awk -F\, '
BEGIN{
completed=0
appsuccessful=0
nsites=0
}
{
completed+=$7
appsuccessful+=$2
nsites+=1
}
END{
if ( completed > 0 ) {
successrate=appsuccessful/completed*100.0
} else {
successrate="N/A"
}
printf("%i,%i,%i,%s\n",nsites,completed,appsuccessful,successrate)
}'
return 0
}
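A usage sketch for the function above (hypothetical dates, illustrative numbers):

# success rate of analysis jobs over April 2014:
dashboard_exit_status 2014-04-01 2014-04-30 analysis
# prints e.g.: 120,1500000,1350000,90   (nsites,completed,app-successful,rate%)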

dashboard_job_slots_used() {
# print the jobs slots used per day from the dashboard by site
#
# Usage:
# dashboard_job_slots_used begin_date end_date activity
# where dates are in the form YYYY-mm-dd and activity=[analysis|production|all|analysistest]
#
# Output:
# csv list by site: site,average-job-slots-per-day
date1=$1
date2=$2
activity=$3

url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumberscsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${activity}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=r"

curl -ks $url | dos2unix | sort -t \, -k 3 | awk -F\, '
BEGIN{
lastsite="None"
totaljobslots=0
totaldays=0
}
{
site=$3
if ( site != lastsite && lastsite != "None" ){
if ( totaldays > 0 ) {
slotsperday=int(totaljobslots/totaldays)
} else {
slotsperday=0
}
printf("%s,%i\n",lastsite,slotsperday)
totaljobslots=0
totaldays=0
}
lastsite=site
totaljobslots+=$1
totaldays+=1
}
END{
if ( totaldays > 0 ) {
slotsperday=int(totaljobslots/totaldays)
} else {
slotsperday=0
}
printf("%s,%i\n",site,slotsperday)
}'
return 0
}
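A usage sketch for the function above (hypothetical dates, illustrative output):

# average job slots used per day, by site, for all activities over one week:
dashboard_job_slots_used 2014-05-12 2014-05-19 all
# prints lines like: T2_US_UCSD,3500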

dashboard_report() {
# print a summary table of analysis job slot usage and success rates
# from the dashboard over recent time periods
# ARGS: Granularity in days of the time period for the table
# ARGS: Number of time periods to display
GRANULARITY=$1
NUMBER_OF_PERIODS=$2
echo
echo "Analysis Jobs Report for the past $NUMBER_OF_PERIODS periods of $GRANULARITY days."
echo
printf "%10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n" \
"Begin" "End" "Analysis" "Analysis" "All" "Analysis" "Analysis" "Analysis" "Number" "Number" "App-"
printf "%10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n" \
"Date" "Date" "Job Slots" "Test" "Activity" "Job Slots" "Job Slots" "Job Slots" "of Sites" "of Jobs" "Success"
printf "%10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n" \
" " " " "Used" "Job Slots" "Job Slots" "at T1" "at T2" "at T3" " " "Completed" "Rate"
date1=`date -dlast-monday +%F`
for (( i=1; i<=$NUMBER_OF_PERIODS; i++ )) ; do
date2=$date1
date1=`date -d "$date2 -$GRANULARITY days" +%F`
printf "%10s %10s " $date1 $date2
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{x+=$2}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysistest | awk -F, 'BEGIN{x=0}{x+=$2}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 all | awk -F, 'BEGIN{x=0}{x+=$2}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T1/){x+=$2}}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T2/){x+=$2}}END{printf("%10i ",x)}'
dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T3/){x+=$2}}END{printf("%10i ",x)}'
dashboard_exit_status $date1 $date2 analysis | awk -F, '{printf("%10i %10i %10.1f%%\n",$1,$2,$4)}'
done
return
}
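A usage sketch (the first period ends last Monday, per the date arithmetic above):

# report four weekly periods:
dashboard_report 7 4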
