From 86195b5d3d8f3f8f4b99b23f01bcb1533a4b34ef Mon Sep 17 00:00:00 2001
From: JAMES LETTS
Date: Mon, 12 Jan 2015 15:38:49 -0800
Subject: [PATCH] Various updates.

---
 JobExitCodes                    | 659 ++++++++++++++++++++++++++++++++
 condor_check.sh                 |  21 +-
 condor_history_dump.sh          |   8 +-
 dev/condor_functions.sh         | 144 -------
 dev/dashboard_functions.sh      | 246 ------------
 dev/monitor.sh                  |  30 --
 dev/monitor_functions.sh        | 117 ------
 dev/sitedb_functions.sh         | 144 -------
 multi-core.py                   |  12 +-
 select_overflow_from_history.sh |   2 +
 10 files changed, 681 insertions(+), 702 deletions(-)
 create mode 100644 JobExitCodes
 delete mode 100644 dev/condor_functions.sh
 delete mode 100644 dev/dashboard_functions.sh
 delete mode 100755 dev/monitor.sh
 delete mode 100755 dev/monitor_functions.sh
 delete mode 100644 dev/sitedb_functions.sh

diff --git a/JobExitCodes b/JobExitCodes
new file mode 100644
index 0000000..83b3bd2
--- /dev/null
+++ b/JobExitCodes
@@ -0,0 +1,659 @@
JobExitCodes < CMSPublic < TWiki
Error codes currently sent from CMS jobs to the dashboard

ALERT! indicates site error
  • Error exit code of the cmsRun application itself - range 0-10000. Exit codes in 1-512 are standard Unix ones and indicate a CMSSW abort that cmsRun did not catch as an exception; codes of 128+N generally mean the process was terminated by signal N. (A lookup sketch follows this list.)
      • -1 - Error return without specification
      • 1 - Hangup (POSIX)
      • 2 - Interrupt (ANSI)
      • 3 - unknown
      • 4 - Illegal instruction (ANSI)
      • 5 - Trace trap (POSIX)
      • 6 - Abort (ANSI) or IOT trap (4.2 BSD)
      • 7 - Bus error (4.2 BSD)
      • 8 - Floating point exception (ANSI)
      • 9 - Killed, unblockable (POSIX) kill -9
      • 10 - User defined
      • 11 - Segmentation violation (ANSI)
      • 12 - User defined
      • 15 - Termination (ANSI)
      • 24 - Soft CPU limit exceeded (4.2 BSD)
      • 25 - File size limit exceeded (4.2 BSD)
      • 29 - nondefined
      • 30 - Power failure restart (System V)
      • 33 - nondefined
      • 64 - I/O error: cannot open data file (SEAL)
      • 65 - End of job from user application (CMSSW)
      • 66 - Application exception
      • 67 - unknown
      • 68 - unknown
      • 73 - Failed writing to read-only file system
      • 84 - Some required file not found; check logs for name of missing file
      • 88 - unknown
      • 90 - Application exception
      • 100 - nondefined
      • 126 - unknown
      • 127 - Error while loading shared library
      • 129 - Hangup (POSIX)
      • 132 - Illegal instruction (ANSI)
      • 133 - Trace trap (POSIX)
      • 134 - Abort (ANSI) or IOT trap (4.2 BSD)
      • 135 - Bus error (4.2 BSD)
      • 136 - unknown
      • 137 - Killed, unblockable (POSIX) kill -9
      • 138 - User defined
      • 139 - Segmentation violation (ANSI)
      • 140 - User defined
      • 143 - Termination (ANSI)
      • 152 - CPU limit exceeded (4.2 BSD)
      • 153 - File size limit exceeded (4.2 BSD)
      • 155 - Profiling alarm clock (4.2 BSD)
      • 251 - nondefined
      • 255 - nondefined
      • 256 - Application exception
      • 512 - nondefined
      • 2304 - nondefined
      • 0001 - Plug-in or message service initialization exception
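The condor_exit_codes helper in dev/condor_functions.sh, removed further down in this patch, matches a raw HTCondor exit status against this table modulo 256. A minimal standalone sketch of the same lookup, assuming the table has been saved locally as a plain-text JobExitCodes file:

    # Sketch only: print table entries whose code equals the raw HTCondor
    # exit status modulo 256 (mirrors condor_exit_codes in this patch).
    lookup_exit_code() {
      grep -o '[0-9]* - .*' JobExitCodes \
        | awk -F' - ' -v code="$1" '(code == $1 % 256) {print $0}'
    }
    # Example: lookup_exit_code 139  ->  139 - Segmentation violation (ANSI)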
  • cmsRun (CMSSW) exit codes. These codes may depend on the specific CMSSW version; the list is maintained in CVS, so look at the tags there to find what is appropriate for a given CMSSW release. The situation as of 5_0_X is below.
      • // The first three are specific categories of CMS Exceptions.
      • 7000 - Exception from command line processing
      • 7001 - Configuration File Not Found
      • 7002 - Configuration File Read Error
      • 8001 - Other CMS Exception
      • 8002 - std::exception (other than bad_alloc)
      • 8003 - Unknown Exception
      • 8004 - std::bad_alloc (memory exhaustion)
      • 8005 - Bad Exception Type (e.g. throwing a string)
      • // The rest are specific categories of CMS Exceptions.
      • 8006 - ProductNotFound
      • 8007 - DictionaryNotFound
      • 8008 - InsertFailure
      • 8009 - Configuration
      • 8010 - LogicError
      • 8011 - UnimplementedFeature
      • 8012 - InvalidReference
      • 8013 - NullPointerError
      • 8014 - NoProductSpecified
      • 8015 - EventTimeout
      • 8016 - EventCorruption
      • 8017 - ScheduleExecutionFailure
      • 8018 - EventProcessorFailure
      • 8019 - FileInPathError
      • ALERT! 8020 - FileOpenError (likely a site error)
      • ALERT! 8021 - FileReadError (may be a site error)
      • 8022 - FatalRootError
      • 8023 - MismatchedInputFiles
      • 8024 - ProductDoesNotSupportViews
      • 8025 - ProductDoesNotSupportPtr
      • 8026 - NotFound (something other than a product or dictionary not found)
      • 8027 - FormatIncompatibility
      • ALERT! 8028 - FileOpenError with fallback
  • ALERT! Failures related to the environment setup - range 10000-19999 (a bootstrap sketch follows this list)
      • ALERT! 10001 - LD_LIBRARY_PATH is not defined
      • ALERT! 10002 - Failed to setup LCG_LD_LIBRAR_PATH
      • ALERT! 10016 - OSG $WORKING_DIR could not be created
      • ALERT! 10017 - OSG $WORKING_DIR could not be deleted
      • ALERT! 10018 - OSG $WORKING_DIR could not be deleted
      • ALERT! 10020 - Shell script cmsset_default.sh to set up the CMS environment is not found
      • ALERT! 10021 - Failed to scram application project using the afs release area
      • ALERT! 10022 - Failed to scram application project using the CMS sw distribution on the LCG2
      • ALERT! 10030 - Middleware not identified
      • ALERT! 10031 - Directory VO_CMS_SW_DIR not found
      • ALERT! 10032 - Failed to source the CMS environment setup script, such as cmsset_default.sh, or the grid system or site equivalent script
      • ALERT! 10033 - Platform is incompatible with the scram version
      • ALERT! 10034 - Required application version is not found at the site
      • ALERT! 10035 - Scram project command failed
      • ALERT! 10036 - Scram runtime command failed
      • ALERT! 10037 - Failed to find cms_site_config file in software area
      • ALERT! 10038 - Failed to find cms_site_catalogue.sh file in software area
      • ALERT! 10039 - cms_site_catalogue.sh failed to provide the catalogue
      • ALERT! 10040 - Failed to generate cmsRun cfg file at runtime
      • ALERT! 10041 - Failed to find valid client for output stage out
      • ALERT! 10042 - Unable to stage in wrapper tarball
      • ALERT! 10043 - Unable to bootstrap WMCore libraries (most likely site python is broken)
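For illustration, a hedged sketch of the kind of wrapper-side bootstrap these codes guard; the variable and script names come from the table above, but the actual CMS job wrapper is not part of this patch:

    # Illustrative only - not the real job wrapper. CMSSW_VERSION is a placeholder.
    [ -n "$VO_CMS_SW_DIR" ]                   || exit 10031
    [ -f "$VO_CMS_SW_DIR/cmsset_default.sh" ] || exit 10020
    source "$VO_CMS_SW_DIR/cmsset_default.sh" || exit 10032
    scram project CMSSW "$CMSSW_VERSION"      || exit 10035
    eval `scram runtime -sh`                  || exit 10036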
  • Executable file related failures - range 50000-59999 (a watchdog sketch follows this list)
      • 50110 - Executable file is not found
      • 50111 - Executable file has no exe permissions
      • 50112 - User executable shell file is not found
      • 50113 - Executable did not get enough arguments
      • 50114 - OSG $WORKING_DIR could not be deleted
      • 50115 - cmsRun did not produce a valid job report at runtime (often means cmsRun segfaulted)
      • 50116 - Could not determine exit code of cmsRun executable at runtime
      • 50117 - Could not update exit code in job report (a variation of 50115)
      • 50513 - Failure to run SCRAM setup scripts
      • 50660 - Application terminated by wrapper because it used too much RAM (RSS)
      • 50661 - Application terminated by wrapper because it used too much virtual memory (VSIZE)
      • 50662 - Application terminated by wrapper because it used too much disk
      • 50663 - Application terminated by wrapper because it used too much CPU time
      • 50664 - Application terminated by wrapper because it used too much wall clock time
      • 50669 - Application terminated by wrapper for an unspecified reason
      • 50700 - Job wrapper did not produce any usable output file
      • 50800 - Application segfaulted (likely user code problem)
      • 50998 - Problem calculating file details (i.e. size, checksum, etc.)
      • 50999 - OSG $WORKING_DIR could not be deleted
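A hedged sketch of the wrapper-side resource watchdog behind codes 50660-50664; CMSRUN_PID and the limits are placeholders, and the real wrapper logic is not included in this patch:

    # Illustrative only: kill the payload and report why it was killed.
    RSS_KB=`ps -o rss= -p $CMSRUN_PID`
    VSZ_KB=`ps -o vsz= -p $CMSRUN_PID`
    if [ $RSS_KB -gt $MAX_RSS_KB ] ; then kill $CMSRUN_PID ; exit 50660 ; fi
    if [ $VSZ_KB -gt $MAX_VSZ_KB ] ; then kill $CMSRUN_PID ; exit 50661 ; fi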
  • Staging-out related troubles - range 60000-69999 (a stage-out sketch follows this list)
      • 60300 - Either OutputSE or OutputSE_DIR not defined
      • 60301 - Neither zip nor tar exists
      • 60302 - Output file(s) not found
      • 60303 - File already exists on the SE
      • 60304 - Failed to create the summary file (production)
      • 60305 - Failed to create a zipped archive (production)
      • 60306 - Failed to copy and register output file
      • 60307 - Failed to copy an output file to the SE (sometimes caused by a timeout)
      • 60308 - An output file was saved to a fallback local SE after failing to copy to the remote SE
      • 60309 - Failed to create an output directory in the catalogue
      • 60310 - Failed to register an output file in the catalogue
      • ALERT! 60311 - Stage out failure in ProdAgent job
      • 60312 - Failed to get file TURL via lcg-lr command
      • 60313 - Failed to delete the output from the previous run via lcg-del command
      • 60314 - Failed to invoke ProdAgent StageOut script
      • ALERT! 60315 - ProdAgent StageOut initialisation error (due to TFC, SITECONF, etc.)
      • 60316 - Failed to create a directory on the SE
      • 60317 - Forced timeout for stuck stage out
      • 60318 - Internal error in CRAB cmscp.py stage-out script
      • 60319 - Failure to do asynchronous stage out
      • 60401 - Failure to assemble LFN in direct-to-merge by size (WMAgent)
      • 60402 - Failure to assemble LFN in direct-to-merge by event (WMAgent)
      • 60403 - Timeout during attempted file transfer - status unknown (WMAgent)
      • 60404 - Timeout during staging of log archives - status unknown (WMAgent)
      • 60405 - General failure to stage out log archives (WMAgent)
      • 60406 - Failure in staging in log files during log collection (WMAgent)
      • 60407 - Timeout in staging in log files during log collection (WMAgent)
      • 60408 - Failure to stage out log files during log collection (WMAgent)
      • 60409 - Timeout in stage out of log files during log collection (WMAgent)
      • 60410 - Failure in deleting log files in log collection (WMAgent)
      • 60411 - Timeout in deleting log files in log collection (WMAgent)
      • 60451 - Output file lacked adler32 checksum (WMAgent)
      • 60452 - No run/lumi information in file (WMAgent)
      • 60999 - OSG $WORKING_DIR could not be deleted
      • 61101 - No sites are available to submit the job because the location of its input(s) does not pass the site whitelist/blacklist restrictions (WMAgent)
      • 61102 - The job can only run at a site that is currently in Aborted state (WMAgent)
      • 61103 - The JobSubmitter component could not load the job pickle (WMAgent)
      • 61104 - The job can only run at a site that is currently in Draining state (WMAgent)
      • 61300 - The job was killed by the WMAgent, reason unknown (WMAgent)
      • 61301 - The job was killed by the WMAgent because the site it was running at was set to Aborted (WMAgent)
      • 61302 - The job was killed by the WMAgent because the site it was running at was set to Draining (WMAgent)
      • 61303 - The job was killed by the WMAgent because the site it was running at was set to Down (WMAgent)
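Similarly, a hedged sketch of a guarded copy behind codes 60307 and 60317; stageout_cmd and the file arguments are placeholders for whatever transfer tool a site uses:

    timeout 600 stageout_cmd "$LOCAL_FILE" "$REMOTE_SE_PATH"
    case $? in
      0)   : ;;           # copy succeeded
      124) exit 60317 ;;  # forced timeout for stuck stage out
      *)   exit 60307 ;;  # failed to copy the output file to the SE
    esac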
  • Problems saving output via the output sandbox - range 70000-70009
      • 70000 - Output_sandbox too big for WMS: output cannot be retrieved
      • 70500 - Warning: problem with ModifyJobReport
  • Other problems
      • 99109 - Uncaught exception in WMAgent step executor
Topic revision: r70 - 05 May 2014 - StefanoBelforte
+ \ No newline at end of file diff --git a/condor_check.sh b/condor_check.sh index a2378df..39840dc 100755 --- a/condor_check.sh +++ b/condor_check.sh @@ -16,26 +16,25 @@ OUTFILE=$glideinWMSMonitor_OUTPUT_DIR/monitor-anaops-`/bin/date +%F-Z%R -u`.txt cat >> $OUTFILE <> $OUTFILE -rc=$? cat >> $OUTFILE <> $OUTFILE - cat >> $OUTFILE <> $OUTFILE <> $OUTFILE -#alarm 600 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 >> $OUTFILE +#alarm 600 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 short >> $OUTFILE +alarm 600 $glideinWMSMonitor_RELEASE_DIR/condor_check $COLLECTOR1 $COLLECTOR2 >> $OUTFILE #if [ $rc -eq 0 ] ; then LINKNAME=$glideinWMSMonitor_OUTPUT_DIR/latest.txt diff --git a/condor_history_dump.sh b/condor_history_dump.sh index 8b82cd3..abe5f3c 100755 --- a/condor_history_dump.sh +++ b/condor_history_dump.sh @@ -27,10 +27,10 @@ $glideinWMSMonitor_RELEASE_DIR/condor_history_analyze.sh $POOLNAME > ${OUTFILE}. mv ${OUTFILE}.tmp $OUTFILE -POOLNAME="glidein-collector-2.t2.ucsd.edu" -OUTFILE=$glideinWMSMonitor_OUTPUT_DIR/latest-overflow.txt -$glideinWMSMonitor_RELEASE_DIR/condor_history_analyze_overflow.sh $POOLNAME > ${OUTFILE}.tmp -mv ${OUTFILE}.tmp $OUTFILE +#POOLNAME="glidein-collector-2.t2.ucsd.edu" +#OUTFILE=$glideinWMSMonitor_OUTPUT_DIR/latest-overflow.txt +#$glideinWMSMonitor_RELEASE_DIR/condor_history_analyze_overflow.sh $POOLNAME > ${OUTFILE}.tmp +#mv ${OUTFILE}.tmp $OUTFILE POOLNAME="vocms097.cern.ch" OUTFILE=$glideinWMSMonitor_OUTPUT_DIR/latest-global-history.txt diff --git a/dev/condor_functions.sh b/dev/condor_functions.sh deleted file mode 100644 index b78bdc4..0000000 --- a/dev/condor_functions.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/bin/bash - -getClassAds() { - # Function to dump a set of ClassAds for queued, running and jobs - # from the past 24h of condor history. If the command fails remotely, - # then one can try to gsissh to the node to execute the query. - # - # Usage: - # getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_history" - # getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_q" - # Output: - # Space separated list of job ClassAds, one row per job. - # - # Called from condor_history_dump - - POOLNAME=$1 ; shift - SCHEDDNAME=$1 ; shift - MACHINENAME=$1 ; shift - - NOW=`/bin/date +%s` - YESTERDAY=$[$NOW-86400] - command="$@ \ - -const '(EnteredCurrentStatus>$YESTERDAY)' \ - -format 'JobStatus=%i\ ' JobStatus \ - -format 'LastJobStatus=%i\ ' LastJobStatus \ - -format 'ExitCode=%i\ ' ExitCode \ - -format 'EnteredCurrentStatus=%i\ ' EnteredCurrentStatus \ - -format 'ImageSize=%i\ ' ImageSize \ - -format 'RemoteWallClockTime=%i\ ' RemoteWallClockTime \ - -format 'RemoteUserCpu=%i\ ' RemoteUserCpu \ - -format 'LastRemoteHost=%s\ ' LastRemoteHost \ - -format 'MATCH_GLIDEIN_CMSSite=%s\ ' MATCH_GLIDEIN_CMSSite \ - -format 'DESRIED_Sites=%s\ ' DESIRED_Sites \ - -format 'DESRIED_SEs=%s\ ' DESIRED_SEs \ - -format 'Owner=%s\ ' Owner \ - -format 'AccountingGroup=%s\ ' AccountingGroup \ - -format 'Iwd=%s\ ' Iwd \ - -format 'HoldReasonCode=%i\ ' HoldReasonCode \ - -format 'HoldReasonSubCode=%i\ ' HoldReasonSubCode \ - -format 'HoldReason=%s\ ' HoldReason \ - -format 'GlobalJobId=%s\\n' GlobalJobId" - eval $command -pool $POOLNAME -name $SCHEDDNAME || gsissh $MACHINENAME $command - rc=$? - return $rc -} - -get_pilots_by_site() { - # Function to list the number of pilots running per site. 
- # - # Usage: - # get_pilots_by_site POOLNAME [optional args for condor_status] - # Output: - # File name of temporary file containing the numbers of pilots - # running at CMSSites, one line per site. - - PILOTS=`mktemp -t PILOTS.txt.XXXXXXX` || return 1 - condor_status -pool $@ -format '{%s}\n' GLIDEIN_CMSSite | sort | uniq -c > $PILOTS || return 2 - echo $PILOTS - return 0 -} - -get_DESIRED_Sites() { - # Get all queued jobs' DESIRED_Sites, translating from DESIRED_SEs - # if needed (i.e. for CRAB2). If DESIRED_Sites exists, take that. - # Otherwise take DESIRED_SEs and translate using SEDFILE from SiteDB. - # Note that DAG jobs do not have DESIRED_Sites defined since they - # run on a schedd and are not counted here. - # - # Usage: - # get_DESIRED_Sites $POOLNAME - # Output: - # File name of temporary file containing the list of DESIRES_Sites, - # one line per job. - - POOLNAME=$1 - - source $glideinWMSMonitor_RELEASE_DIR/sitedb_functions.sh - SEDFILE=`translate_se_names_in_sitedb_to_cmssite` - - SCHEDDS=`condor_status -pool $POOLNAME -const '(TotalIdleJobs>0)' -schedd -format ' -name %s' Name ` || return 1 - DESIRED=`mktemp -t DESIRED.txt.XXXXXXX` || return 2 - - # run condor_q if there are queued jobs in the pool only: - if [ `echo $SCHEDDS | wc -w` -ne 0 ] ; then - condor_q $SCHEDDS -pool $POOLNAME -const '(JobStatus=?=1)' \ - -format '%s' DESIRED_Sites -format ' %s' DESIRED_SEs -format ' %s\n' Owner \ - | sed 's/undefined//g' | awk '{print $1}' | sed -f $SEDFILE >> $DESIRED - fi - - echo $DESIRED - rm $SEDFILE - return 0 -} - -condor_history_dump() { - # Function to dump all ClassAds defined in getClassAds - # for all queued, running and jobs that completed in the - # past day in a glideinWMS pool. - # - # Usage: - # condor_history_dump $POOLNAME - # Output: - # ClassAd values, one line per job. - # - POOLNAME=$1 - SCHEDDS=`condor_status -pool $POOLNAME -schedd -format '%s\,' Name -format '%s\n' Machine` - for SCHEDD in $SCHEDDS ; do - SCHEDDNAME=` echo $SCHEDD | awk -F\, '{print $1}'` - MACHINENAME=`echo $SCHEDD | awk -F\, '{print $2}'` - getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_history" - getClassAds $POOLNAME $SCHEDDNAME $MACHINENAME "condor_q" - done - return 0 -} - -condor_exit_codes() { - # Function to extract the possible matching CMSSW error codes given the %256 - # ExitCode from HTCondor. The CMSSW exit codes can be found in the $URL but - # it is not publically visible yet. Later you can download periodically to - # the release directory. - # - # Usage: - # condor_error_codes $CONDOR_EXIT_CODE - # Output: - # CMSSW exit code matches and text explanations. - # - CONDOR_EXIT_CODE=$1 - # - # Exit code explanation file - # - FILE=$glideinWMSMonitor_RELEASE_DIR/JobExitCodes - if [ ! -f $FILE ] ; then - URL="https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes" - curl -ks -o $FILE $URL - fi - # - # grep the explanation of a particular code(s). 
- # - grep \- $FILE | grep -o [0-9]*\ \-\ .* | sed 's/<.*>//g' \ - | awk -F\- -v code=$CONDOR_EXIT_CODE '(code==$1%256){print $0}' - return 0 -} - -# Condor Hold Reason Codes: http://research.cs.wisc.edu/htcondor/manual/v7.6/10_Appendix_A.html diff --git a/dev/dashboard_functions.sh b/dev/dashboard_functions.sh deleted file mode 100644 index 720508d..0000000 --- a/dev/dashboard_functions.sh +++ /dev/null @@ -1,246 +0,0 @@ -#!/bin/bash - -site_downtimes_from_ssb() { - # get the downtimes from SSB, remove any quotes so the output - # is just csv list of [site,downtime string] - # - # Usage: - # site_downtimes_from_ssb - # Output: - # File name of temporary file containing the csv list. - - OUTPUTFILE=`mktemp -t DOWNTIMES.csv.XXXXXXXXXX` || return 1 - url="http://dashb-ssb.cern.ch/dashboard/request.py/getallshort?view=maint" - curl -ks -H 'Accept:text/csv' $url | awk -F\, '{print $1 "," $3}' \ - | tr -d \" | tr -d \' | grep OUTAGE > $OUTPUTFILE || return 1 - echo $OUTPUTFILE - return 0 -} - -dashboard_users() { - # Returns the number of users from all activities, except unknown and production user - # By construction this is the number of analysis users. - # - # Usage: - # dashboard_users begin_date end_date - # Output: - # Number of users. - # - - date1=$1 - date2=$2 - tier=$3 - url="http://dashb-cms-job.cern.ch/dashboard/request.py/jobsummary-plot-or-table2?user=&site=&submissiontool=&application=&activity=&status=&check=submitted&tier=${tier}&sortby=user&ce=&rb=&grid=&jobtype=&submissionui=&dataset=&submissiontype=&task=&subtoolver=&genactivity=&outputse=&appexitcode=&accesstype=&date1=${date1}&date2=${date2}&prettyprint" - users=`curl -sk $url | grep \"name\": | grep -v "unknown/cmsdataops" | grep -v "unknown/unknown" | wc -l` - echo $users - return 0 -} - -dashboard_usage_by_site() { - # function to print out a csv list of sites and avg and - # max job slots used daily during the past TIMEFRAME of - # a certain activity, which can be "all", "analysis", - # "analysistest", "production", etc. - # - # Usage: - # dashboard_usage_by_site analysis "3 months ago" - # dashboard_usage_by_site all "2 weeks ago" - # - # Output: - # File name of a temporary file containing the csv list. - # - # N.B. Data before April 3, 2014 for T2_US_Wisconsin is over-inflated! - - OUTPUTFILE=`mktemp -t USAGE.csv.XXXXXXXXXX` || return 1 - - # argument is the activity to list e.g. 
analysis, production, all - ACTIVITY=$1 - TIMEFRAME=$2 - - # look at the last 3 months: - date1=`date -d "$TIMEFRAME" +%F` - date2=`date +%F` - - # url for dashboard historical usage by site: - url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumberscsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${ACTIVITY}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=r" - curl -ks $url | dos2unix | sort -t \, -k 3 | awk -F\, ' - BEGIN{ - lastsite="None" - totaljobslots=0 - totaldays=0 - maxjobslots=0 - } - { - site=$3 - if ( site != lastsite && lastsite != "None" ){ - slotsperday=int(totaljobslots/totaldays) - printf("%s,%i,%i\n",lastsite,slotsperday,maxjobslots) - totaljobslots=0 - totaldays=0 - maxjobslots=0 - } - lastsite=site - totaljobslots+=$1 - totaldays+=1 - if ( $1 > maxjobslots ) { maxjobslots=$1 } - } - END{ - slotsperday=int(totaljobslots/totaldays) - printf("%s,%i,%i\n",site,slotsperday,maxjobslots) - }' > $OUTPUTFILE - echo "$OUTPUTFILE" - return 0 -} - -dashboard_exit_status() { - # print the exit status of jobs from the dashboard by site - # - # Usage: - # dashboard_exit_status begin_date end_date activity sitefilter="T1|T2|T3" - # where dates are in the form YYYY-mm-dd and activity=[analysis|production|all|analysistest] - # - # Output: - # #csv list by site: app-unknown,app-successful,app-failed,site-failed,cancelled,aborted,completed,site - # csv list : nsites,completed,appsuccessful,successrate - - date1=$1 - date2=$2 - activity=$3 - url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumbers_terminatedcsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${activity}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=gstb" - curl -ks $url | dos2unix | awk -v sitefilter=$4 -F\, ' - BEGIN{ - completed=0 - appsuccessful=0 - nsites=0 - } - { - if ( sitefilter=="T1" ) { if ( $8!~/^T1/ ) { next } } - if ( sitefilter=="T2" ) { if ( $8!~/^T2/ ) { next } } - if ( sitefilter=="T3" ) { if ( $8!~/^T3/ ) { next } } - completed+=$7 - appsuccessful+=$2 - nsites+=1 - } - END{ - if ( completed > 0 ) { - successrate=appsuccessful/completed*100.0 - } else { - successrate="N/A" - } - printf("%i,%i,%i,%s\n",nsites,completed,appsuccessful,successrate) - }' - return 0 -} - -dashboard_job_slots_used() { - # print the jobs slots used per day from the dashboard by site - # - # Usage: - # dashboard_job_slots_used begin_date end_date activity - # where dates are in the form YYYY-mm-dd and activity=[analysis|production|all|analysistest] - # - # Output: - # csv list by site: app-unknown,app-successful,app-failed,site-failed,cancelled,aborted,completed,site - date1=$1 - date2=$2 - activity=$3 - - url="http://dashb-cms-jobsmry.cern.ch/dashboard/request.py/jobnumberscsv?sites=All%20T3210&datatiers=All%20DataTiers&applications=All%20Application%20Versions&submissions=All%20Submission%20Types&accesses=All%20Access%20Types&activities=${activity}&sitesSort=7&start=${date1}&end=${date2}&timeRange=daily&granularity=daily&generic=0&sortBy=0&series=All&type=r" - - curl -ks $url | dos2unix | sort -t \, -k 3 | awk -F\, ' - BEGIN{ - lastsite="None" - totaljobslots=0 - totaldays=0 - } - { - site=$3 - if ( site != lastsite && lastsite != "None" ){ - if ( 
totaldays > 0 ) { - slotsperday=int(totaljobslots/totaldays) - } else { - slotsperday=0 - } - printf("%s,%i\n",lastsite,slotsperday) - totaljobslots=0 - totaldays=0 - } - lastsite=site - totaljobslots+=$1 - totaldays+=1 - } - END{ - if ( totaldays > 0 ) { - slotsperday=int(totaljobslots/totaldays) - } else { - slotsperday=0 - } - printf("%s,%i\n",site,slotsperday) - }' - return 0 -} - -dashboard_user_report() { - GRANULARITY=$1 - NUMBER_OF_PERIODS=$2 - printf "%10s,%10s,%10s,%10s\n" date1 date2 nusers nuserst2 - date1=`date -dlast-monday +%F` - for (( i=1; i<=$NUMBER_OF_PERIODS; i++ )) ; do - date2=$date1 - date1=`date -d "$date2 -$GRANULARITY days" +%F` - nusers=` dashboard_users $date1 $date2 | awk '{print $1}'` - nuserst2=`dashboard_users $date1 $date2 2.0 | awk '{print $1}'` - printf "%10s,%10s,%10s,%10s\n" $date1 $date2 $nusers $nuserst2 - done - return -} - -dashboard_job_report() { - # ARGS: Granularity in days of the time period for the table - # ARGS: Number of time periods to display - GRANULARITY=$1 - NUMBER_OF_PERIODS=$2 - printf "%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s\n" \ - date1 date2 \ - ncrab2T1 ncrab3T1 nallT1 jscrab2T1 jscrab3T1 jsallT1 \ - ncrab2T2 ncrab3T2 nallT2 jscrab2T2 jscrab3T2 jsallT2 \ - ncrab2T3 ncrab3T3 nallT3 jscrab2T3 jscrab3T3 jsallT3 - - date1=`date -dlast-monday +%F` - for (( i=1; i<=$NUMBER_OF_PERIODS; i++ )) ; do - date2=$date1 - date1=`date -d "$date2 -$GRANULARITY days" +%F` - - jscrab2T1=`dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T1/){x+=$2}}END{print x}'` - jscrab3T1=`dashboard_job_slots_used $date1 $date2 analysistest | awk -F, 'BEGIN{x=0}{if($1~/^T1/){x+=$2}}END{print x}'` - jsallT1=` dashboard_job_slots_used $date1 $date2 all | awk -F, 'BEGIN{x=0}{if($1~/^T1/){x+=$2}}END{print x}'` - - jscrab2T2=`dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T2/){x+=$2}}END{print x}'` - jscrab3T2=`dashboard_job_slots_used $date1 $date2 analysistest | awk -F, 'BEGIN{x=0}{if($1~/^T2/){x+=$2}}END{print x}'` - jsallT2=` dashboard_job_slots_used $date1 $date2 all | awk -F, 'BEGIN{x=0}{if($1~/^T2/){x+=$2}}END{print x}'` - - jscrab2T3=`dashboard_job_slots_used $date1 $date2 analysis | awk -F, 'BEGIN{x=0}{if($1~/^T3/){x+=$2}}END{print x}'` - jscrab3T3=`dashboard_job_slots_used $date1 $date2 analysistest | awk -F, 'BEGIN{x=0}{if($1~/^T3/){x+=$2}}END{print x}'` - jsallT3=` dashboard_job_slots_used $date1 $date2 all | awk -F, 'BEGIN{x=0}{if($1~/^T3/){x+=$2}}END{print x}'` - - ncrab2T1=`dashboard_exit_status $date1 $date2 analysis T1 | awk -F, '{print $2}'` - ncrab3T1=`dashboard_exit_status $date1 $date2 analysistest T1 | awk -F, '{print $2}'` - nallT1=` dashboard_exit_status $date1 $date2 all T1 | awk -F, '{print $2}'` - - ncrab2T2=`dashboard_exit_status $date1 $date2 analysis T2 | awk -F, '{print $2}'` - ncrab3T2=`dashboard_exit_status $date1 $date2 analysistest T2 | awk -F, '{print $2}'` - nallT2=` dashboard_exit_status $date1 $date2 all T2 | awk -F, '{print $2}'` - - ncrab2T3=`dashboard_exit_status $date1 $date2 analysis T3 | awk -F, '{print $2}'` - ncrab3T3=`dashboard_exit_status $date1 $date2 analysistest T3 | awk -F, '{print $2}'` - nallT3=` dashboard_exit_status $date1 $date2 all T3 | awk -F, '{print $2}'` - - printf "%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s,%10s\n" \ - $date1 $date2 \ - $ncrab2T1 $ncrab3T1 $nallT1 $jscrab2T1 $jscrab3T1 $jsallT1 \ - $ncrab2T2 $ncrab3T2 $nallT2 $jscrab2T2 
$jscrab3T2 $jsallT2 \ - $ncrab2T3 $ncrab3T3 $nallT3 $jscrab2T3 $jscrab3T3 $jsallT3 - - done - return -} diff --git a/dev/monitor.sh b/dev/monitor.sh deleted file mode 100755 index bd3f683..0000000 --- a/dev/monitor.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/sh - -source /home/letts/Monitor/glideinWMSMonitor/dev/bashrc - -run() { - - COLLECTOR1=$1 - COLLECTOR2=$2 - NAME=$3 - - export TMPDIR=$glideinWMSMonitor_OUTPUT_DIR/tmp_$$ - mkdir $TMPDIR - - OUTFILE=$glideinWMSMonitor_OUTPUT_DIR/monitor-summary-${NAME}-`/bin/date +%F-Z%R -u`.json - monitor_pool_json_summary_table $COLLECTOR1 $COLLECTOR2 $NAME > $OUTFILE - ln -sf $OUTFILE $glideinWMSMonitor_OUTPUT_DIR/latest-summary-${NAME}.json - - OUTFILE=$glideinWMSMonitor_OUTPUT_DIR/monitor-site-${NAME}-`/bin/date +%F-Z%R -u`.json - monitor_pool_json_site_table $COLLECTOR1 $COLLECTOR2 $NAME > $OUTFILE - ln -sf $OUTFILE $glideinWMSMonitor_OUTPUT_DIR/latest-site-${NAME}.json - - rmdir $TMPDIR - -} - -run vocms097.cern.ch vocms099.cern.ch global -run glidein-collector.t2.ucsd.edu glidein-collector-2.t2.ucsd.edu analysisops -#run vocms97.cern.ch cmssrv119.fnal.gov production - -exit diff --git a/dev/monitor_functions.sh b/dev/monitor_functions.sh deleted file mode 100755 index 50e932c..0000000 --- a/dev/monitor_functions.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -monitor_pool_json_summary_table() { - # Usage: monitor_pool_json_summary_table collector1 collector2 title - # Callout: condor_status - - COLLECTOR1=$1 - COLLECTOR2=$2 - shift; shift - NAME=$@ - NOW=$[1000*`/bin/date +%s`] - - running1=`condor_status -schedd -total -pool $COLLECTOR1 | tail -1 | awk '{print int($2)}'` - running2=`condor_status -schedd -total -pool $COLLECTOR2 | tail -1 | awk '{print int($2)}'` - colldiff=`echo $running1 $running2 | awk '{print int($1-$2)}'` - queued1=`condor_status -schedd -total -pool $COLLECTOR1 | tail -1 | awk '{print int($3)}'` - queued2=`condor_status -schedd -total -pool $COLLECTOR2 | tail -1 | awk '{print int($3)}'` - negotime=`condor_status -pool $COLLECTOR1 -nego -l | grep LastNegotiationCycleDuration0 | awk '{print int($3)}'` - if [ -z $negotime ] ; then - negotime=`condor_status -pool $COLLECTOR2 -nego -l | grep LastNegotiationCycleDuration0 | awk '{print int($3)}'` - fi - - if [ -z $running1 ] ; then running1=null ; fi - if [ -z $running2 ] ; then running2=null ; fi - if [ -z $colldiff ] ; then colldiff=null ; fi - if [ -z $queued1 ] ; then queued1=null ; fi - if [ -z $queued2 ] ; then queued2=null ; fi - if [ -z $negotime ] ; then negotime=null ; fi - - cat < $SEDFILE - - echo "$SEDFILE" - return 0 -} - -get_pledges_from_sitedb() { - # output: comma separated list of CMSSite and latest CPU pledges - # in kHS06 divided by 10 to normalize roughly to cores. - # Federated pledges come at the end, so you need to take the - # last entry per site. Earlier (zero) entries may be from SitDB. - - # error if X509_USER_PROXY is not defined - - if [ -z $X509_USER_PROXY ] ; then - echo "ERROR: X509_USER_PROXY not defined!" 
- return 1 - fi - - # this url gives pledges by sitedb name like "ASGC" - - url="https://cmsweb.cern.ch/sitedb/data/prod/resource-pledges" - thisyear=`/bin/date +%Y` - TMPFILE=`mktemp -t TMPPLEDGES.txt.XXXXXXXXXX` || return 1 - - # get pledges from sitedb only for this year and translate - # to CMSSite name not the generic site name - - SEDFILE=`translate_site_names_from_sidedb_to_cmssite` - curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY $url \ - | awk -F\, -v ty=$thisyear '($4==ty){print $2 "," $3 "," $5}' \ - | tr \[ \ | tr \" \ | sed 's/ //g' | sort | sed -f $SEDFILE | sort > $TMPFILE - - # Remove multiple pledges for the same site ($1) for this year - # by taking the most recently entered ($2). Approximate kHS06 - # to physical cpu by dividing by 10. - - PLEDGES=`mktemp -t PLEDGES.txt.XXXXXXXXXX` || return 2 - sites=`cat $TMPFILE | awk -F\, '{print $1}' | sort | uniq | grep ^T` - for site in $sites ; do - grep ^$site\, $TMPFILE | tail -1 | awk -F\, '{print $1 "," int($3*1000./10.) "," strftime("%F",$2)}' >> $PLEDGES - done - - # corrections for federation pledges. Always take the last one - get_federation_pledges >> $PLEDGES - - rm $TMPFILE $SEDFILE - echo "$PLEDGES" - return 0 -} - -translate_se_names_in_sitedb_to_cmssite() { - # output: name of sed file to translate SE names to CMSSite name - - SELIST=`mktemp -t SELIST.sed.XXXXXXXXXX` || exit 1 - SEDFILE=`translate_site_names_from_sidedb_to_cmssite` - - # from SiteDB get the list of SE, sitedb site name - # and translate sitedb site name to CMSSite name with $SEDFILE - # output as a sed file - - url="https://cmsweb.cern.ch/sitedb/data/prod/site-resources" - curl -ks --cert $X509_USER_PROXY --key $X509_USER_PROXY $url \ - | grep \"SE\" | awk -F\" '{print $2 "," $6}' | sed 's/ //g' \ - | sed -f $SEDFILE | sort | awk -F\, '{print "s/" $2 "/" $1 "/"}' > $SELIST - - rm $SEDFILE - echo "$SELIST" - return 0 -} diff --git a/multi-core.py b/multi-core.py index 0b0d82a..80e56d2 100755 --- a/multi-core.py +++ b/multi-core.py @@ -9,20 +9,20 @@ print "All,", print "Static,", print "Static Retiring,", -print "Partitionable,", -print "Partitionable Retiring,", +print "Dynamic+Partitionable,", +print "Dynamic+Partitionable Retiring,", print "All Idle,", print "Static Idle,", print "Static Retiring Idle,", -print "Partitionable Idle,", -print "Partitionable Retiring Idle,", +print "Dynamic+Partitionable Idle,", +print "Dynamic+Partitionable Retiring Idle,", print "All Idle %,", print "Static Idle %,", print "Static Retiring Idle %", -print "Partitionable Idle %,", -print "Partitionable Retiring Idle%" +print "Dynamic+Partitionable Idle %,", +print "Dynamic+Partitionable Retiring Idle%" FILES=glob.glob("/crabprod/CSstoragePath/Monitor-json/monitor-multicore-*.json") for FILE in FILES : diff --git a/select_overflow_from_history.sh b/select_overflow_from_history.sh index 3aae9ae..95a1939 100755 --- a/select_overflow_from_history.sh +++ b/select_overflow_from_history.sh @@ -32,6 +32,7 @@ grep MATCH_GLIDEIN $INPUTFILE | awk ' \ overflow="yes" for ( i in MATCH_GLIDEIN_CMSSite ) { for ( j in DESIRED_Sites ) { + print "DEBUG " MATCH_GLIDEIN_CMSSite[i] " " DESIRED_Sites[j] if ( MATCH_GLIDEIN_CMSSite[i]==DESIRED_Sites[j] ) { overflow="no" break @@ -41,6 +42,7 @@ grep MATCH_GLIDEIN $INPUTFILE | awk ' \ } # print out a new history file with only overflow jobs + print "DEBUG->" overflow if ( overflow=="yes" ) { print $0 } }'
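For reference, the overflow test patched above can be exercised standalone. A small sketch with the same matching logic (the site lists in the examples are illustrative):

    is_overflow() {
      # $1: comma-separated MATCH_GLIDEIN_CMSSite values; $2: comma-separated DESIRED_Sites
      awk -v a="$1" -v b="$2" 'BEGIN {
        na = split(a, matched, ","); nb = split(b, desired, ",")
        overflow = "yes"
        for (i = 1; i <= na; i++)
          for (j = 1; j <= nb; j++)
            if (matched[i] == desired[j]) overflow = "no"
        print overflow
      }'
    }
    # is_overflow "T2_US_UCSD" "T2_US_Nebraska,T2_US_Wisconsin"   -> yes
    # is_overflow "T2_US_UCSD" "T2_US_UCSD,T2_US_Wisconsin"       -> no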