Skip to content

Commit

Permalink
Issue #5: detect when slurm kills jobs because they exceeded resource…
Browse files Browse the repository at this point in the history
… allocations

add --cluster-status $0/slurm-status.py

Modified slurm-status.py to use argv[4] instead of argv[1]
as they had in https://github.com/Snakemake-Profiles/slurm

Add slurm-status.py to the install script, and have snakemakeslurm check for it.
Issue a warning if it's missing.
  • Loading branch information
rusalkaguy committed May 12, 2020
1 parent 911fa12 commit 20c4983
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 1 deletion.
2 changes: 2 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ fi
# Copy the wrapper script, the SLURM status helper and the cluster config
# into the install directory given as the first argument.
# The destination is quoted so a path containing spaces or glob characters
# is passed to rsync as a single argument.
# NOTE(review): FLAGS is left unquoted so it can expand to zero or more
# rsync options — confirm against where FLAGS is assigned.
echo "# install exec and config"
rsync -hav --info=name2,stats0,flist0 $FLAGS \
  ./snakemakeslurm \
  ./slurm-status.py \
  ./cluster.slurm.cheaha.json \
  "$1"

# Copy the modulefiles two directory levels above the install directory,
# skipping editor backup files ("*~").
echo "# install modulefiles"
rsync -hav --info=name2,stats0,flist0 $FLAGS \
  --exclude="*~" \
  ./modulefiles/snakemakeslurm \
  "$1/../../modulefiles"
43 changes: 43 additions & 0 deletions modulefiles/snakemakeslurm/5.2.4-3
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#%Module1.0#####################################################################
##
## snakemakeslurm
##
## https://github.com/rusalkaguy/snakemake-slurm-module
##
################################################################################
# Derive program name, versions and install root from the name this
# modulefile was loaded under.
# NOTE(review): assumes the module name has the form
# <prog>/<snakemake-version>-<script_version> (e.g. snakemakeslurm/5.2.4-3)
# — confirm against how the modulefile is installed.
set components [ file split [ module-info name ] ]
# first path element of the module name
set prog [ lindex $components 0 ]
# second path element, split on "-": the part before the dash ...
set snakemake-version [ lindex [ split [ lindex $components 1 ] - ] 0 ]
# ... and the part after the dash
set script_version [ lindex [ split [ lindex $components 1 ] - ] 1 ]
# the hyphenated variable name can only be dereferenced in the ${...} form
set version ${snakemake-version}-${script_version}
# directory that is later prepended to PATH
set modroot /share/apps/ngs-ccts/${prog}/${prog}-${version}
# project home page; not referenced again in this modulefile
set url https://github.com/rusalkaguy/snakemake-slurm-module

# ModulesHelp --
#   Print a short description of the module plus the values derived from the
#   module name (components, version, install root) to stderr.
#   Takes no arguments and returns nothing useful.
proc ModulesHelp { } {
    # every file-level variable read below must be imported explicitly;
    # without "components" here the proc fails with
    # "can't read "components": no such variable"
    global version modroot components
    puts stderr "\tsnakemakeslurm - have snakemake submit jobs via slurm to cheaha"
    puts stderr "\n\tComponents $components\n"
    puts stderr "\n\tVersion $version\n"
    puts stderr "\n\tmodroot $modroot\n"
}
module-whatis "Loads snakemakeslurm environment."

# load required modules
# make sure to load dvctools first; if loaded second, it loads an older GCC.
module load dvctools
# UAB-HPC old module naming convention
# #module load rc/snakemake/${snakemake-version}
module load snakemake/${snakemake-version}-foss-2018b-Python-3.6.6
# snakemake 4.8.0 doesn't work with Anaconda3/5.3.0
# snakemake 5.2.4 DOES work with Anaconda3/5.3.1 -
# update Anaconda3 to 2020.02 to work with RNA-Seq pipeline
module load Anaconda3/2020.02
# only one version at a time
#conflict multiqc

#
#
# Make the directories available
prepend-path PATH $modroot/
#prepend-path MANPATH $modroot/share/man
#prepend-path PYTHONPATH $modroot/lib/python2.7/site-packages
4 changes: 3 additions & 1 deletion slurm-status.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@

STATUS_ATTEMPTS = 20

jobid = sys.argv[1]
#jobid = sys.argv[1] # original from https://github.com/Snakemake-Profiles/slurm
# Locally, our snakemake --cluster-status passes the args ["Submitted", "batch", "job", JOBID]!
jobid = sys.argv[4]


for i in range(STATUS_ATTEMPTS):
Expand Down
18 changes: 18 additions & 0 deletions snakemakeslurm
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,23 @@ fi
# list in order, later files override earlier ones
#

# global config lives with this script

#
# Check whether we have a script to get the "cause of death" from SLURM.
# Without it, if SLURM kills a job for over-time or over-mem, snakemake won't know.
#
# See: https://snakemake.readthedocs.io/en/stable/tutorial/additional_features.html?highlight=--cluster-status%20#using-cluster-status
#
# Sets SLURM_STATUS_SCRIPT to the expected location of slurm-status.py (in the
# same directory as this script) and SLURM_STATUS_FLAG to the matching
# --cluster-status option; SLURM_STATUS_FLAG stays empty when the helper is
# missing, so it adds nothing to the snakemake command line.
SLURM_STATUS_FLAG=
# quote both the substitution and $0 so a script path with spaces survives dirname
SLURM_STATUS_SCRIPT="$(dirname "$0")/slurm-status.py"
if [ -e "$SLURM_STATUS_SCRIPT" ]; then
  # "=" is the portable string comparison for [ ]; ${DEBUG:-} tolerates DEBUG being unset
  if [ "${DEBUG:-}" = "1" ]; then echo "# found $SLURM_STATUS_SCRIPT"; fi
  SLURM_STATUS_FLAG=" --cluster-status $SLURM_STATUS_SCRIPT"
else
  # diagnostics belong on stderr so they cannot leak into piped snakemake output
  echo "# WARNING: missing $SLURM_STATUS_SCRIPT " >&2
fi

# global config lives with this script
CCONFIGS="--cluster-config $(dirname $0)/cluster.slurm.cheaha.json"

Expand Down Expand Up @@ -130,5 +147,6 @@ $EXEC snakemake \
--latency-wait 45 \
--jobs 999 \
$CCONFIGS \
$SLURM_STATUS_FLAG \
--cluster "sbatch $SM_ARGS" \
$*

0 comments on commit 20c4983

Please sign in to comment.