diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb new file mode 100755 index 0000000..4b5f3c5 --- /dev/null +++ b/job_scripts/hpss/process.xrb @@ -0,0 +1,261 @@ +#!/bin/ksh -p + +#---------------------------------------------------------------------------- +# user modifiable variables: + +# jobidfile is a lock file that is used to make sure that only one instance +# of this script is working on the current directory +jobidfile=process.jobid + + +# set the prefix of the plotfiles and checkpoint files +plt_prefix=*plt +chk_prefix=*chk + +# directory to archive to on HPSS -- set this to the working directory +work_dir=`pwd` +HPSS_DIR=`basename $work_dir` + +# set HTAR command +HTAR=htar + +# extra arguments to HTAR +# -P will create intermediate directories on HPSS (i.e. mkdir -p) +HTAR_ARGS=(-H copies=2 -P) + +# path to the ftime executable -- used for making a simple ftime.out file +# listing the name of the plotfile and its simulation time +FTIME_EXE=ftime.Linux.gfortran.exe + + +#---------------------------------------------------------------------------- +# initialization stuff + +# check to make sure that the lock file does not already exist. 
+if [ -f "$jobidfile" ]; then + # check if job is still running + existing_job=$(<"$jobidfile") + if [ "$(sacct -X -P -n -o State -j "$existing_job")" != RUNNING ]; then + echo "process: removing stale lock file for job $existing_job" + rm "$jobidfile" + else + echo "process job $existing_job is still running" + exit 2 + fi +fi + +# create the lock file +echo "$SLURM_JOB_ID" > "$jobidfile" + +# if our process is killed, remove the lock file first +function cleanup { + echo "process: received signal; removing $jobidfile" + command rm -f "$jobidfile" + # remove the EXIT handler, since we only want to do this once + trap - EXIT + # don't exit, so we can finish the current operation: + # $jobidfile is checked at the start of each loop iteration in process_files() +} +trap cleanup EXIT HUP INT QUIT TERM XCPU + +# Number of seconds to sleep before checking again. +N=60 + + +#---------------------------------------------------------------------------- +# make storage directories + +# once we process a file, we will move the plotfiles into the plotfiles/ +# directory. This then hides them from the script, so if the system +# later purges the files in the pltXXXXX directory and the .processed +# file, we don't overwrite our archived data with a tarred empty +# directory structure. We do the same with the checkpoint files (using +# checkfiles/) + +if [ ! -d plotfiles ]; then + mkdir plotfiles +fi + +if [ ! -d checkfiles ]; then + mkdir checkfiles +fi + + +#---------------------------------------------------------------------------- +# the processing function + +# Process Files. Once a plotfile is successfully processed, we will output +# a file pltXXXXX.processed (checkpoint files are only archived, with a +# chkXXXXX.processed file appearing once the archiving is successful). +# Subsequent invocations of this routine will skip over any plotfiles or +# checkpoint files that have a corresponding .processed file. + + +function process_files +{ + if [ ! 
-f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi + + + # plotfiles + + # Take all but the final plt file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well + # as pltXXXXX.processed files. We restrict the find command to a depth of + # 1 to avoid catching any already-processed files in the plotfiles/ + # directory + pltlist=($( + find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort + find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort + find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort + )) + + # Don't process the final plt file + unset "pltlist[-1]" + + for dir in "${pltlist[@]}" + do + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi + if [ -d ${dir} ]; then + + # only work on the file if there is not a .processed file in the + # main directory or the plotfiles/ directory + if [ ! -f ${dir}.processed ] && [ ! -f plotfiles/${dir}.processed ]; then + + # do processing + + # store the file on HPSS + ${HTAR} "${HTAR_ARGS[@]}" -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar + + # Ordinarily, we'd check htar's exit status (0 = successful), but + # on some machines (like Atlas) htar doesn't return a valid exit + # status. Instead we'll grep for the success line at the end of + # htar's output (which we piped into a file) and check the output + # status of grep + grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null + + # The variable $? holds the exit status of the previous command + if [ $? 
-eq 0 ]; then + + # mark this file as processed so we skip it next time + date > ${dir}.processed + + # output the plotfile name and simulation time to ftime.out + if [ `command -v ${FTIME_EXE}` ] ; then + ${FTIME_EXE} ${dir} >> ftime.out + fi + + # remove the htar temporary file + rm ${dir}.htar + + # move the plotfile into the plotfiles directory + mv ${dir} plotfiles/ + + # ..and the corresponding .processed file too. + mv ${dir}.processed plotfiles/ + + # and visualize it + #runtimevis.py plotfiles/${dir} + + fi + + fi # end test of whether plotfile already processed + + fi # end test of whether plotfile is a directory (as it should be) + + done + + + # checkpoint files + + # Take all but the final chk file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well + # as chkXXXXX.processed files. We restrict the find command to a depth of + # 1 to avoid catching any already-processed files in the checkfiles/ + # directory + chklist=($( + find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort + find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort + find . -maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort + )) + + # Don't process the final chk file + unset "chklist[-1]" + + for dir in "${chklist[@]}" + do + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi + if [ -d ${dir} ]; then + + if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then + + # store the file on HPSS + ${HTAR} "${HTAR_ARGS[@]}" -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar + + # Ordinarily, we'd check htar's exit status (0 = successful), but + # on some machines (like Atlas) htar doesn't return a valid exit + # status. 
Instead we'll grep for the success line at the end of + # htar's output (which we piped into a file) and check the output + # status of grep + grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null + + # The variable $? holds the exit status of the previous command + if [ $? -eq 0 ]; then + + # mark this file as processed so we skip it next time + date > ${dir}.processed + + # remove the htar temporary file + rm ${dir}.htar + + # move the checkpoint file into the checkfiles directory + mv ${dir} checkfiles/ + + # ..and the corresponding .processed file too. + mv ${dir}.processed checkfiles/ + + fi + + fi + + fi + done + +} + + +#---------------------------------------------------------------------------- +# the main function + +# archive any diagnostic files first -- give them a unique name, appending +# the date string, to make sure that we don't overwrite anything +datestr=$(date +"%Y%m%d_%H%M_%S") +all_files=($( + find . -maxdepth 1 -name "ftime.out" -print + find . -maxdepth 1 -name "inputs*" -print + find . -maxdepth 1 -name "*diag.out" -print + find . -maxdepth 1 -name "*.hse.*" -print + find . -maxdepth 1 -name "*.slurm" -print; find . -maxdepth 1 -name "*.submit" -print + find . -maxdepth 1 -name "process*" -print +)) + +${HTAR} -P -cvf ${HPSS_DIR}/diag_files_${datestr}.tar "${all_files[@]}" >> /dev/null + + +# Loop, waiting for plt and chk directories to appear. + +while true +do + process_files + # put sleep in the background so the shell can handle signals + sleep $N & + wait +done diff --git a/job_scripts/perlmutter/nersc.xfer.slurm b/job_scripts/perlmutter/nersc.xfer.slurm index 5e2879f..41af123 100644 --- a/job_scripts/perlmutter/nersc.xfer.slurm +++ b/job_scripts/perlmutter/nersc.xfer.slurm @@ -1,17 +1,11 @@ -#!/bin/ksh +#!/bin/bash #SBATCH --qos=xfer #SBATCH -J xrb-hpss-xfer #SBATCH -t 12:00:00 #SBATCH --licenses=SCRATCH -cd $SLURM_SUBMIT_DIR - # do our archiving -pidfile=process.pid - -./process.xrb - -PID=$! 
-trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL +cd "$SLURM_SUBMIT_DIR" || exit -rm -f process.pid +# use srun so any control signals get sent to the child too +srun ./process.xrb diff --git a/job_scripts/perlmutter/process.xrb b/job_scripts/perlmutter/process.xrb deleted file mode 100755 index 25972d9..0000000 --- a/job_scripts/perlmutter/process.xrb +++ /dev/null @@ -1,246 +0,0 @@ -#!/bin/ksh -p - -#---------------------------------------------------------------------------- -# user modifiable variables: - -# pidfile is a lock file that is used to make sure that only one instance -# of this script is working on the current directory -pidfile=process.pid - - -# set the prefix of the plotfiles and checkpoint files -plt_prefix=*plt -chk_prefix=*chk - -# directory to archive to on HPSS -- set this to the working directory -work_dir=`pwd` -HPSS_DIR=`basename $work_dir` - -# set HTAR command -HTAR=htar - -# path to the ftime executable -- used for making a simple ftime.out file -# listing the name of the plotfile and its simulation time -FTIME_EXE=ftime.Linux.gfortran.exe - - -#---------------------------------------------------------------------------- -# initialization stuff - -# check to make sure that the lock file does not already exist. -if [ -f $pidfile ]; then - echo 2>&1 "process lock file " $pidfile " already exists" - exit -1 -fi - -# create the lock file -echo $$ > $pidfile - -# if our process if killed, remove the lock file first -trap '/bin/rm -f $pidfile' EXIT HUP TERM XCPU KILL - -# Number of seconds to sleep before checking again. -N=60 - - -#---------------------------------------------------------------------------- -# make storage directories - -# once we process a file, we will move the plotfiles into the plotfiles/ -# directory. This then hides them from the script, so if the system -# later purges the files in the pltXXXXX directory and the .processed -# file, we don't overwrite our archived data with a tarred empty -# directory structure. 
We do the same with the checkpoint files (using -# checkfiles/) - -if [ ! -d plotfiles ]; then - mkdir plotfiles -fi - -if [ ! -d checkfiles ]; then - mkdir checkfiles -fi - - -#---------------------------------------------------------------------------- -# the processing function - -# Process Files. Once a plotfile is successfully processed, we will output -# a file pltXXXXX.processed (checkpoint files are only archived, with a -# chkXXXXX.processed file appearing once the archiving is successful). -# Subsequent invocations of this routine will skip over any plotfiles or -# checkpoint files that have a corresponding .processed file. - - -function process_files -{ - if [ ! -f $pidfile ]; then - echo "process: $pidfile has been removed, exiting" - exit - fi - - - # plotfiles - - # Take all but the final plt file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as pltXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the plotfiles/ - # directory - pltlist5=$(find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort) - pltlist6=$(find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort) - pltlist7=$(find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort) - - pltlist="$pltlist5 $pltlist6 $pltlist7" - - if [ "$pltlist" ]; then - nl=$(echo "$pltlist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - pltlist="" - else - pltlist=$(echo "$pltlist" | head -$nl) - fi - fi - - - for dir in ${pltlist} - do - if [ -d ${dir} ]; then - - # only work on the file if there is not a .processed file in the - # main directory or the plotfiles/ directory - if [ ! -f ${dir}.processed ] && [ ! 
-f plotfiles/${dir}.processed ]; then - - # do processing - - # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # output the plotfile name and simulation time to ftime.out - if [ `command -v ${FTIME_EXE}` ] ; then - ${FTIME_EXE} ${dir} >> ftime.out - fi - - # remove the htar temporary file - rm ${dir}.htar - - # move the plotfile into the plotfiles directory - mv ${dir} plotfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed plotfiles/ - - # and visualize it - #runtimevis.py plotfiles/${dir} - - fi - - fi # end test of whether plotfile already processed - - fi # end test of whether plotfile is a directory (as it should be) - - done - - - # checkpoint files - - # Take all but the final chk file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as chkXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the checkfiles/ - # directory - chklist5=$(find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort) - chklist6=$(find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort) - chklist7=$(find . 
-maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort) - - chklist="$chklist5 $chklist6 $chklist7" - - if [ "$chklist" ]; then - nl=$(echo "$chklist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - chklist="" - else - chklist=$(echo "$chklist" | head -$nl) - fi - fi - - - for dir in ${chklist} - do - if [ -d ${dir} ]; then - - if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then - - # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # remove the htar temporary file - rm ${dir}.htar - - # move the checkpoint file into the checkfiles directory - mv ${dir} checkfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed checkfiles/ - - fi - - fi - - fi - done - -} - - -#---------------------------------------------------------------------------- -# the main function - -# archive any diagnostic files first -- give them a unique name, appending -# the date string, to make sure that we don't overwrite anything -datestr=$(date +"%Y%m%d_%H%M_%S") -ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) -inputs_files=$(find . -maxdepth 1 -name "inputs*" -print) -diag_files=$(find . -maxdepth 1 -name "*diag.out" -print) -model_files=$(find . -maxdepth 1 -name "*.hse.*" -print) -job_files=$(find . -maxdepth 1 -name "*.slurm" -print) $(find . -maxdepth 1 -name "*.submit" -print) -process_files=$(find . 
-maxdepth 1 -name "process*" -print) - -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${probin_files} ${job_files} ${process_files} >> /dev/null - - -# Loop, waiting for plt and chk directories to appear. - -while true -do - process_files - sleep $N -done diff --git a/job_scripts/summit/process.xrb b/job_scripts/summit/process.xrb deleted file mode 100755 index 25972d9..0000000 --- a/job_scripts/summit/process.xrb +++ /dev/null @@ -1,246 +0,0 @@ -#!/bin/ksh -p - -#---------------------------------------------------------------------------- -# user modifiable variables: - -# pidfile is a lock file that is used to make sure that only one instance -# of this script is working on the current directory -pidfile=process.pid - - -# set the prefix of the plotfiles and checkpoint files -plt_prefix=*plt -chk_prefix=*chk - -# directory to archive to on HPSS -- set this to the working directory -work_dir=`pwd` -HPSS_DIR=`basename $work_dir` - -# set HTAR command -HTAR=htar - -# path to the ftime executable -- used for making a simple ftime.out file -# listing the name of the plotfile and its simulation time -FTIME_EXE=ftime.Linux.gfortran.exe - - -#---------------------------------------------------------------------------- -# initialization stuff - -# check to make sure that the lock file does not already exist. -if [ -f $pidfile ]; then - echo 2>&1 "process lock file " $pidfile " already exists" - exit -1 -fi - -# create the lock file -echo $$ > $pidfile - -# if our process if killed, remove the lock file first -trap '/bin/rm -f $pidfile' EXIT HUP TERM XCPU KILL - -# Number of seconds to sleep before checking again. -N=60 - - -#---------------------------------------------------------------------------- -# make storage directories - -# once we process a file, we will move the plotfiles into the plotfiles/ -# directory. 
This then hides them from the script, so if the system -# later purges the files in the pltXXXXX directory and the .processed -# file, we don't overwrite our archived data with a tarred empty -# directory structure. We do the same with the checkpoint files (using -# checkfiles/) - -if [ ! -d plotfiles ]; then - mkdir plotfiles -fi - -if [ ! -d checkfiles ]; then - mkdir checkfiles -fi - - -#---------------------------------------------------------------------------- -# the processing function - -# Process Files. Once a plotfile is successfully processed, we will output -# a file pltXXXXX.processed (checkpoint files are only archived, with a -# chkXXXXX.processed file appearing once the archiving is successful). -# Subsequent invocations of this routine will skip over any plotfiles or -# checkpoint files that have a corresponding .processed file. - - -function process_files -{ - if [ ! -f $pidfile ]; then - echo "process: $pidfile has been removed, exiting" - exit - fi - - - # plotfiles - - # Take all but the final plt file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as pltXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the plotfiles/ - # directory - pltlist5=$(find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort) - pltlist6=$(find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort) - pltlist7=$(find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort) - - pltlist="$pltlist5 $pltlist6 $pltlist7" - - if [ "$pltlist" ]; then - nl=$(echo "$pltlist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - pltlist="" - else - pltlist=$(echo "$pltlist" | head -$nl) - fi - fi - - - for dir in ${pltlist} - do - if [ -d ${dir} ]; then - - # only work on the file if there is not a .processed file in the - # main directory or the plotfiles/ directory - if [ ! -f ${dir}.processed ] && [ ! 
-f plotfiles/${dir}.processed ]; then - - # do processing - - # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # output the plotfile name and simulation time to ftime.out - if [ `command -v ${FTIME_EXE}` ] ; then - ${FTIME_EXE} ${dir} >> ftime.out - fi - - # remove the htar temporary file - rm ${dir}.htar - - # move the plotfile into the plotfiles directory - mv ${dir} plotfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed plotfiles/ - - # and visualize it - #runtimevis.py plotfiles/${dir} - - fi - - fi # end test of whether plotfile already processed - - fi # end test of whether plotfile is a directory (as it should be) - - done - - - # checkpoint files - - # Take all but the final chk file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as chkXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the checkfiles/ - # directory - chklist5=$(find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort) - chklist6=$(find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort) - chklist7=$(find . 
-maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort) - - chklist="$chklist5 $chklist6 $chklist7" - - if [ "$chklist" ]; then - nl=$(echo "$chklist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - chklist="" - else - chklist=$(echo "$chklist" | head -$nl) - fi - fi - - - for dir in ${chklist} - do - if [ -d ${dir} ]; then - - if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then - - # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # remove the htar temporary file - rm ${dir}.htar - - # move the checkpoint file into the checkfiles directory - mv ${dir} checkfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed checkfiles/ - - fi - - fi - - fi - done - -} - - -#---------------------------------------------------------------------------- -# the main function - -# archive any diagnostic files first -- give them a unique name, appending -# the date string, to make sure that we don't overwrite anything -datestr=$(date +"%Y%m%d_%H%M_%S") -ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) -inputs_files=$(find . -maxdepth 1 -name "inputs*" -print) -diag_files=$(find . -maxdepth 1 -name "*diag.out" -print) -model_files=$(find . -maxdepth 1 -name "*.hse.*" -print) -job_files=$(find . -maxdepth 1 -name "*.slurm" -print) $(find . -maxdepth 1 -name "*.submit" -print) -process_files=$(find . 
-maxdepth 1 -name "process*" -print) - -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${probin_files} ${job_files} ${process_files} >> /dev/null - - -# Loop, waiting for plt and chk directories to appear. - -while true -do - process_files - sleep $N -done diff --git a/job_scripts/summit/summit_hpss.submit b/job_scripts/summit/summit_hpss.submit index 8e366ef..d212aa8 100644 --- a/job_scripts/summit/summit_hpss.submit +++ b/job_scripts/summit/summit_hpss.submit @@ -5,13 +5,7 @@ #SBATCH -N 1 # do our archiving -pidfile=process.pid +cd "$SLURM_SUBMIT_DIR" || exit -cd $SLURM_SUBMIT_DIR - -./process.xrb - -PID=$! -trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL - -rm -f process.pid +# use srun so any control signals get sent to the child too +srun ./process.xrb diff --git a/sphinx_docs/source/nersc-hpss.rst b/sphinx_docs/source/nersc-hpss.rst index 4be1df7..2784a67 100644 --- a/sphinx_docs/source/nersc-hpss.rst +++ b/sphinx_docs/source/nersc-hpss.rst @@ -9,41 +9,27 @@ frequently, since the scratch filesystems fill up and NERSC will purge data periodically. -The script ``nersc.xfer.slurm``: +The script ``nersc.xfer.slurm`` in ``job_scripts/perlmutter/``: :download:`nersc.xfer.slurm <../../job_scripts/perlmutter/nersc.xfer.slurm>` can be used to archive data to HPSS automatically. This is submitted to the xfer queue and runs the -script ``process.xrb``: +script ``process.xrb`` in ``job_scripts/hpss/``: -:download:`process.xrb <../../job_scripts/perlmutter/process.xrb>` +:download:`process.xrb <../../job_scripts/hpss/process.xrb>` -which continually looks for output and stores -it to HPSS. +which continually looks for output and stores it to HPSS. +By default, the destination directory on HPSS will have the same name +as the directory your plotfiles are located in. This can be changed by +editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``. The following describes how to use the scripts: -1. 
Create a directory in HPSS that has the same - name as the directory your plotfiles are located in - (just the directory name, not the full path). e.g. if you are running in a directory call - ``/pscratch/sd/z/zingale/wdconvect/`` run, then do: - - .. prompt:: bash - - hsi - mkdir wdconvect - - .. note:: - - If the ``hsi`` command prompts you for your password, you will need - to talk to the NERSC help desk to ask for password-less access to - HPSS. - -2. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm`` +#. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm`` into the directory with the plotfiles. -3. Submit the archive job: +#. Submit the archive job: .. prompt:: bash @@ -80,14 +66,16 @@ Some additional notes: the date-string to allow multiple archives to co-exist. * When ``process.xrb`` is running, it creates a lockfile (called - ``process.pid``) that ensures that only one instance of the script + ``process.jobid``) that ensures that only one instance of the script is running at any one time. .. warning:: Sometimes if the job is not terminated normally, the - ``process.pid`` file will be left behind, in which case, the script - aborts. Just delete that if you know the script is not running. + ``process.jobid`` file will be left behind. Later jobs should be + able to detect this and clean up the stale lockfile, but if this + doesn't work, you can delete the file if you know the script is not + running. Jobs in the xfer queue start up quickly. The best approach is to start one as you start your main job (or make it dependent on the main diff --git a/sphinx_docs/source/olcf-workflow.rst b/sphinx_docs/source/olcf-workflow.rst index 3b78ff6..ab114ab 100644 --- a/sphinx_docs/source/olcf-workflow.rst +++ b/sphinx_docs/source/olcf-workflow.rst @@ -383,27 +383,26 @@ where ``test_hpss.sh`` is a SLURM script that contains the ``htar`` commands needed to archive your data. This uses ``slurm`` as the job manager. 
-An example is provided by the ``process.xrb`` archiving script and -associated ``summit_hpss.submit`` submission script in -``jobs_scripts/summit/``. Together these will detect new plotfiles as -they are generated, tar them up (using ``htar``) and archive them onto -HPSS. They will also store the inputs, probin, and other runtime -generated files. If ``ftime`` is found in your path, it will also -create a file called ``ftime.out`` that lists the simulation time -corresponding to each plotfile. +An example is provided by the ``process.xrb`` archiving script in +``job_scripts/hpss/`` and associated ``summit_hpss.submit`` submission script +in ``job_scripts/summit/``. Together these will detect new plotfiles as they +are generated, tar them up (using ``htar``) and archive them onto HPSS. They +will also store the inputs, probin, and other runtime generated files. If +``ftime`` is found in your path, it will also create a file called +``ftime.out`` that lists the simulation time corresponding to each plotfile. Once the plotfiles are archived they are moved to a subdirectory under your run directory called ``plotfiles/``. +By default, the files will be archived to a directory in HPSS with the same +name as the directory your plotfiles are located in. This can be changed +by editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``. -To use this, we do the following: - -#. Enter the HPSS system via ``hsi`` -#. Create the output directory -- this should have the same name as the directory - you are running in on summit +To use this, we do the following: -#. Exit HPSS +#. Copy the ``process.xrb`` and ``summit_hpss.submit`` scripts into the + directory with the plotfiles. #. 
Launch the script via: @@ -411,7 +410,7 @@ To use this, we do the following: sbatch summit_hpss.submit - It will for the full time you asked, searching for plotfiles as + It will run for the full time you asked, searching for plotfiles as they are created and moving them to HPSS as they are produced (it will always leave the very last plotfile alone, since it can't tell if it is still being written).