Skip to content

Commit

Permalink
Merge pull request #19 from yut23/slurm_graceful_exit
Browse files Browse the repository at this point in the history
Add graceful exit handling on Perlmutter
  • Loading branch information
zingale authored Oct 13, 2023
2 parents 9da3037 + 1b6e657 commit fc64c98
Showing 1 changed file with 28 additions and 1 deletion.
29 changes: 28 additions & 1 deletion job_scripts/perlmutter/perlmutter.submit
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3
#SBATCH --signal=B:URG@2

# Executable that srun launches, and the inputs file handed to it as its
# first argument.  Both are exported into the job environment.
CASTRO_EXEC=./Castro2d.gnu.MPI.CUDA.SMPLSDC.ex
INPUTS=inputs_2d.N14
export CASTRO_EXEC INPUTS
Expand Down Expand Up @@ -56,8 +57,34 @@ else
restartString="amr.restart=${restartFile}"
fi

# Remove a stale dump_and_stop marker from an earlier run so that it cannot
# affect this one (-f: it is not an error if the file does not exist).
rm -f -- dump_and_stop

# The `--signal=B:URG@<n>` option tells slurm to send SIGURG to this batch
# script n seconds before the runtime limit (slurm may deliver it up to 60
# seconds earlier than that), so we can exit gracefully.
# SIGURG handler: creates the dump_and_stop marker file in the current
# directory and logs that Castro is being told to checkpoint and stop.
sig_handler() {
  # One-shot: restore the default SIGURG disposition so that a repeated
  # signal does not run this handler again.
  trap - URG
  touch dump_and_stop
  echo "BATCH: allocation ending soon; telling Castro to dump a checkpoint and stop"
}

# Install the handler for SIGURG in this batch shell.
trap sig_handler URG

# Last path component of the directory the job was submitted from; used to
# label the notification below.  The expansion is quoted so that a submit
# path containing whitespace still reaches basename as a single operand.
workdir=$(basename -- "${SLURM_SUBMIT_DIR}")
# Report the job start, including the restart file name held in restartFile.
# NOTE(review): slack_job_start.py is an external helper not shown here and
# must be on PATH.
slack_job_start.py "starting NERSC job: ${workdir} ${restartFile}" @michael


srun -n 64 ${CASTRO_EXEC} ${INPUTS} ${restartString}
# Launch Castro through srun as a background job and block in the `wait`
# builtin instead of on srun itself: bash defers trap handlers until a
# foreground command finishes, whereas `wait` returns as soon as a trapped
# signal arrives, so the SIGURG handler can run while the job is still going.
# NOTE(review): the expansions are left unquoted as in the original command
# line; presumably restartString may be empty and must then add no argument.
srun -n 64 ${CASTRO_EXEC} ${INPUTS} ${restartString} &
pid=$!

# Status `wait` reports when it was interrupted by SIGURG (128 + signal 23)
# rather than by srun terminating.
sigurg_status=$((128 + 23))

wait "$pid"
ret=$?

if [[ $ret -eq $sigurg_status ]]; then
  # received SIGURG: the handler has run, so keep waiting for srun to finish
  # and pick up its exit status
  wait "$pid"
  ret=$?
fi

# propagate the final status as the batch script's own exit status
exit "$ret"

0 comments on commit fc64c98

Please sign in to comment.