
Commit

deploy: 11c6ff4
zingale committed Feb 1, 2024
1 parent 82240e0 commit e9a777b
Showing 7 changed files with 323 additions and 90 deletions.
14 changes: 4 additions & 10 deletions _downloads/05203ba4d0d4aa143482b601abe7c8b1/nersc.xfer.slurm
@@ -1,17 +1,11 @@
-#!/bin/ksh
+#!/bin/bash
#SBATCH --qos=xfer
#SBATCH -J xrb-hpss-xfer
#SBATCH -t 12:00:00
#SBATCH --licenses=SCRATCH

-cd $SLURM_SUBMIT_DIR
-
-# do our archiving
-pidfile=process.pid
-
-./process.xrb
-
-PID=$!
-trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL
+cd "$SLURM_SUBMIT_DIR" || exit

-rm -f process.pid
+# use srun so any control signals get sent to the child too
+srun ./process.xrb
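The comment in the new script is the crux of this change: under Slurm, control signals (e.g. the warning sent near the time limit) are delivered to the batch shell, and a trap set there cannot run while a foreground child is executing. Running the child under ``srun`` (or backgrounding it and using ``wait``) lets the signal get through promptly. A minimal, self-contained sketch of that shell behavior (not part of the repo; the signal and timings are made up for illustration):

```shell
#!/bin/bash
# Sketch: a trap in the parent shell cannot fire while a foreground child
# runs, but with `child & wait` the shell regains control as soon as a
# trapped signal arrives.
got_signal=0
trap 'got_signal=1' USR1

# deliver USR1 to this shell one second from now
( sleep 1; kill -USR1 $$ ) &
signaler=$!

sleep 10 &          # stand-in for a long-running child process
child=$!

wait "$child" || true   # interrupted by USR1; the trap runs immediately
kill "$child" 2>/dev/null
wait "$signaler" 2>/dev/null

echo "got_signal=$got_signal"
```

With a plain foreground `sleep 10` instead, the trap would not run until the child exited on its own, which is exactly the failure mode the old `./process.xrb` invocation had.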
261 changes: 261 additions & 0 deletions _downloads/cd10ec9dd07537d7054ccb73c28380f4/process.xrb
@@ -0,0 +1,261 @@
#!/bin/ksh -p

#----------------------------------------------------------------------------
# user modifiable variables:

# jobidfile is a lock file that is used to make sure that only one instance
# of this script is working on the current directory
jobidfile=process.jobid


# set the prefix of the plotfiles and checkpoint files
plt_prefix=*plt
chk_prefix=*chk

# directory to archive to on HPSS -- set this to the working directory
work_dir=$(pwd)
HPSS_DIR=$(basename "$work_dir")

# set HTAR command
HTAR=htar

# extra arguments to HTAR
# -P will create intermediate directories on HPSS (i.e. mkdir -p)
HTAR_ARGS=(-H copies=2 -P)

# path to the ftime executable -- used for making a simple ftime.out file
# listing the name of the plotfile and its simulation time
FTIME_EXE=ftime.Linux.gfortran.exe


#----------------------------------------------------------------------------
# initialization stuff

# check to make sure that the lock file does not already exist.
if [ -f "$jobidfile" ]; then
    # check if job is still running
    existing_job=$(<"$jobidfile")
    if [ "$(sacct -X -P -n -o State -j "$existing_job")" != RUNNING ]; then
        echo "process: removing stale lock file for job $existing_job"
        rm "$jobidfile"
    else
        echo "process job $existing_job is still running"
        exit 2
    fi
fi

# create the lock file
echo "$SLURM_JOB_ID" > "$jobidfile"

# if our process is killed, remove the lock file first
function cleanup {
    echo "process: received signal; removing $jobidfile"
    command rm -f "$jobidfile"
    # remove the EXIT handler, since we only want to do this once
    trap - EXIT
    # don't exit, so we can finish the current operation:
    # $jobidfile is checked at the start of each loop iteration in process_files()
}
trap cleanup EXIT HUP INT QUIT TERM XCPU

# Number of seconds to sleep before checking again.
N=60


#----------------------------------------------------------------------------
# make storage directories

# once we process a file, we will move the plotfiles into the plotfiles/
# directory. This then hides them from the script, so if the system
# later purges the files in the pltXXXXX directory and the .processed
# file, we don't overwrite our archived data with a tarred empty
# directory structure. We do the same with the checkpoint files (using
# checkfiles/)

if [ ! -d plotfiles ]; then
    mkdir plotfiles
fi

if [ ! -d checkfiles ]; then
    mkdir checkfiles
fi


#----------------------------------------------------------------------------
# the processing function

# Process Files. Once a plotfile is successfully processed, we will output
# a file pltXXXXX.processed (checkpoint files are only archived, with a
# chkXXXXX.processed file appearing once the archiving is successful).
# Subsequent invocations of this routine will skip over any plotfiles or
# checkpoint files that have a corresponding .processed file.


function process_files
{
    if [ ! -f "$jobidfile" ]; then
        echo "process: $jobidfile has been removed, exiting"
        exit
    fi


    # plotfiles

    # Take all but the final plt file -- we want to ensure they're completely
    # written to disk. Strip out any tar files that are lying around as well
    # as pltXXXXX.processed files. We restrict the find command to a depth of
    # 1 to avoid catching any already-processed files in the plotfiles/
    # directory
    pltlist=($(
        find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort
        find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort
        find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort
    ))

    # Don't process the final plt file
    unset "pltlist[-1]"

    for dir in "${pltlist[@]}"
    do
        if [ ! -f "$jobidfile" ]; then
            echo "process: $jobidfile has been removed, exiting"
            exit
        fi
        if [ -d "${dir}" ]; then

            # only work on the file if there is not a .processed file in the
            # main directory or the plotfiles/ directory
            if [ ! -f "${dir}.processed" ] && [ ! -f "plotfiles/${dir}.processed" ]; then

                # do processing

                # store the file on HPSS
                ${HTAR} "${HTAR_ARGS[@]}" -cvf "${HPSS_DIR}/${dir}.tar" "${dir}" > "${dir}.htar"

                # Ordinarily, we'd check htar's exit status (0 = successful), but
                # on some machines (like Atlas) htar doesn't return a valid exit
                # status. Instead we'll grep for the success line at the end of
                # htar's output (which we piped into a file) and check the exit
                # status of grep
                grep "HTAR: HTAR SUCCESSFUL" "${dir}.htar" > /dev/null

                # The variable $? holds the exit status of the previous command
                if [ $? -eq 0 ]; then

                    # mark this file as processed so we skip it next time
                    date > "${dir}.processed"

                    # output the plotfile name and simulation time to ftime.out
                    if [ "$(command -v ${FTIME_EXE})" ]; then
                        ${FTIME_EXE} "${dir}" >> ftime.out
                    fi

                    # remove the htar temporary file
                    rm "${dir}.htar"

                    # move the plotfile into the plotfiles directory
                    mv "${dir}" plotfiles/

                    # ...and the corresponding .processed file too
                    mv "${dir}.processed" plotfiles/

                    # and visualize it
                    #runtimevis.py plotfiles/${dir}

                fi

            fi   # end test of whether plotfile already processed

        fi   # end test of whether plotfile is a directory (as it should be)

    done


    # checkpoint files

    # Take all but the final chk file -- we want to ensure they're completely
    # written to disk. Strip out any tar files that are lying around as well
    # as chkXXXXX.processed files. We restrict the find command to a depth of
    # 1 to avoid catching any already-processed files in the checkfiles/
    # directory
    chklist=($(
        find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort
        find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort
        find . -maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort
    ))

    # Don't process the final chk file
    unset "chklist[-1]"

    for dir in "${chklist[@]}"
    do
        if [ ! -f "$jobidfile" ]; then
            echo "process: $jobidfile has been removed, exiting"
            exit
        fi
        if [ -d "${dir}" ]; then

            if [ ! -f "${dir}.processed" ] && [ ! -f "checkfiles/${dir}.processed" ]; then

                # store the file on HPSS
                ${HTAR} "${HTAR_ARGS[@]}" -cvf "${HPSS_DIR}/${dir}.tar" "${dir}" > "${dir}.htar"

                # Ordinarily, we'd check htar's exit status (0 = successful), but
                # on some machines (like Atlas) htar doesn't return a valid exit
                # status. Instead we'll grep for the success line at the end of
                # htar's output (which we piped into a file) and check the exit
                # status of grep
                grep "HTAR: HTAR SUCCESSFUL" "${dir}.htar" > /dev/null

                # The variable $? holds the exit status of the previous command
                if [ $? -eq 0 ]; then

                    # mark this file as processed so we skip it next time
                    date > "${dir}.processed"

                    # remove the htar temporary file
                    rm "${dir}.htar"

                    # move the checkpoint file into the checkfiles directory
                    mv "${dir}" checkfiles/

                    # ...and the corresponding .processed file too
                    mv "${dir}.processed" checkfiles/

                fi

            fi

        fi
    done

}


#----------------------------------------------------------------------------
# the main function

# archive any diagnostic files first -- give them a unique name, appending
# the date string, to make sure that we don't overwrite anything
datestr=$(date +"%Y%m%d_%H%M_%S")
all_files=($(
    find . -maxdepth 1 -name "ftime.out" -print
    find . -maxdepth 1 -name "inputs*" -print
    find . -maxdepth 1 -name "*diag.out" -print
    find . -maxdepth 1 -name "*.hse.*" -print
    find . -maxdepth 1 -name "*.slurm" -print
    find . -maxdepth 1 -name "*.submit" -print
    find . -maxdepth 1 -name "process*" -print
))

${HTAR} -P -cvf "${HPSS_DIR}/diag_files_${datestr}.tar" "${all_files[@]}" > /dev/null


# Loop, waiting for plt and chk directories to appear.

while true
do
    process_files

    # put sleep in the background so the shell can handle signals
    sleep $N &
    wait
done
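The ``.processed`` marker scheme described in the comments above is what makes repeated invocations of ``process.xrb`` idempotent. A toy illustration (hypothetical plotfile names, no real ``htar`` call, run in a throwaway temp directory):

```shell
#!/bin/bash
# Toy version of the ".processed" marker scheme: a plotfile directory is
# archived only if no marker exists in the working directory or plotfiles/.
workdir=$(mktemp -d)
cd "$workdir" || exit 1

mkdir plt00100 plt00200 plotfiles
date > plt00100.processed           # pretend plt00100 was archived earlier

archived=""
for dir in plt00100 plt00200; do
    # archive only if no marker exists here or in plotfiles/
    if [ ! -f "${dir}.processed" ] && [ ! -f "plotfiles/${dir}.processed" ]; then
        archived="$archived $dir"   # the real script runs htar here
        date > "${dir}.processed"
        mv "$dir" "${dir}.processed" plotfiles/
    fi
done

echo "archived:${archived}"         # -> archived: plt00200
```

Because both the plotfile and its marker are moved into `plotfiles/`, a later purge of scratch cannot trick a fresh run into re-archiving an empty directory over good data, which is the failure mode the comments in the script warn about.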
40 changes: 14 additions & 26 deletions _sources/nersc-hpss.rst.txt
@@ -9,41 +9,27 @@
frequently, since the scratch filesystems fill up and NERSC will purge
data periodically.


-The script ``nersc.xfer.slurm``:
+The script ``nersc.xfer.slurm`` in ``job_scripts/perlmutter/``:

:download:`nersc.xfer.slurm <../../job_scripts/perlmutter/nersc.xfer.slurm>`

can be used to archive data to
HPSS automatically. This is submitted to the xfer queue and runs the
-script ``process.xrb``:
+script ``process.xrb`` in ``job_scripts/hpss/``:

-:download:`process.xrb <../../job_scripts/perlmutter/process.xrb>`
+:download:`process.xrb <../../job_scripts/hpss/process.xrb>`

-which continually looks for output and stores
-it to HPSS.
+which continually looks for output and stores it to HPSS.
+By default, the destination directory on HPSS will have the same name
+as the directory your plotfiles are located in. This can be changed by
+editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``.

The following describes how to use the scripts:

-1. Create a directory in HPSS that has the same
-   name as the directory your plotfiles are located in
-   (just the directory name, not the full path). e.g. if you are running
-   in a directory called ``/pscratch/sd/z/zingale/wdconvect/``, then do:
-
-   .. prompt:: bash
-
-      hsi
-      mkdir wdconvect
-
-   .. note::
-
-      If the ``hsi`` command prompts you for your password, you will need
-      to talk to the NERSC help desk to ask for password-less access to
-      HPSS.
-
-2. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm``
+#. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm``
   into the directory with the plotfiles.

-3. Submit the archive job:
+#. Submit the archive job:

.. prompt:: bash

@@ -80,14 +66,16 @@ Some additional notes:
  the date-string to allow multiple archives to co-exist.

* When ``process.xrb`` is running, it creates a lockfile (called
-  ``process.pid``) that ensures that only one instance of the script
+  ``process.jobid``) that ensures that only one instance of the script
  is running at any one time.

.. warning::

   Sometimes if the job is not terminated normally, the
-   ``process.pid`` file will be left behind, in which case, the script
-   aborts. Just delete that if you know the script is not running.
+   ``process.jobid`` file will be left behind. Later jobs should be
+   able to detect this and clean up the stale lockfile, but if this
+   doesn't work, you can delete the file if you know the script is not
+   running.
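The stale-lock recovery the warning refers to is the ``sacct`` check at the top of ``process.xrb``. A self-contained sketch of that logic, runnable off-cluster (``sacct`` is stubbed here, since the real command queries Slurm accounting, and the job id is made up):

```shell
#!/bin/bash
# Sketch of the stale-lock check: a lock file naming a job that is no
# longer RUNNING is treated as stale and removed.
sacct() { echo "COMPLETED"; }       # stub standing in for Slurm's sacct

cd "$(mktemp -d)" || exit 1
jobidfile=process.jobid
echo "12345" > "$jobidfile"         # hypothetical job id from a dead job

if [ -f "$jobidfile" ]; then
    existing_job=$(<"$jobidfile")
    if [ "$(sacct -X -P -n -o State -j "$existing_job")" != RUNNING ]; then
        echo "removing stale lock file for job $existing_job"
        rm "$jobidfile"
    else
        echo "job $existing_job is still running"
    fi
fi
```

Manual cleanup (deleting ``process.jobid`` yourself) is only needed if this check cannot run, e.g. if Slurm accounting is unavailable.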

Jobs in the xfer queue start up quickly. The best approach is to start
one as you start your main job (or make it dependent on the main
