From e9a777b0e89b4d4a9b1ffa2b90daadd4c0f340e4 Mon Sep 17 00:00:00 2001 From: zingale Date: Thu, 1 Feb 2024 13:50:45 +0000 Subject: [PATCH] deploy: 11c6ff4308625d97c72bfd22c19e9245ed3c0814 --- .../nersc.xfer.slurm | 14 +- .../process.xrb | 261 ++++++++++++++++++ _sources/nersc-hpss.rst.txt | 40 +-- _sources/olcf-workflow.rst.txt | 29 +- nersc-hpss.html | 41 ++- olcf-workflow.html | 26 +- searchindex.js | 2 +- 7 files changed, 323 insertions(+), 90 deletions(-) create mode 100644 _downloads/cd10ec9dd07537d7054ccb73c28380f4/process.xrb diff --git a/_downloads/05203ba4d0d4aa143482b601abe7c8b1/nersc.xfer.slurm b/_downloads/05203ba4d0d4aa143482b601abe7c8b1/nersc.xfer.slurm index 5e2879f..41af123 100644 --- a/_downloads/05203ba4d0d4aa143482b601abe7c8b1/nersc.xfer.slurm +++ b/_downloads/05203ba4d0d4aa143482b601abe7c8b1/nersc.xfer.slurm @@ -1,17 +1,11 @@ -#!/bin/ksh +#!/bin/bash #SBATCH --qos=xfer #SBATCH -J xrb-hpss-xfer #SBATCH -t 12:00:00 #SBATCH --licenses=SCRATCH -cd $SLURM_SUBMIT_DIR - # do our archiving -pidfile=process.pid - -./process.xrb - -PID=$! 
-trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL +cd "$SLURM_SUBMIT_DIR" || exit -rm -f process.pid +# use srun so any control signals get sent to the child too +srun ./process.xrb diff --git a/_downloads/cd10ec9dd07537d7054ccb73c28380f4/process.xrb b/_downloads/cd10ec9dd07537d7054ccb73c28380f4/process.xrb new file mode 100644 index 0000000..4b5f3c5 --- /dev/null +++ b/_downloads/cd10ec9dd07537d7054ccb73c28380f4/process.xrb @@ -0,0 +1,261 @@ +#!/bin/ksh -p + +#---------------------------------------------------------------------------- +# user modifiable variables: + +# jobidfile is a lock file that is used to make sure that only one instance +# of this script is working on the current directory +jobidfile=process.jobid + + +# set the prefix of the plotfiles and checkpoint files +plt_prefix=*plt +chk_prefix=*chk + +# directory to archive to on HPSS -- set this to the working directory +work_dir=`pwd` +HPSS_DIR=`basename $work_dir` + +# set HTAR command +HTAR=htar + +# extra arguments to HTAR +# -P will create intermediate directories on HPSS (i.e. mkdir -p) +HTAR_ARGS=(-H copies=2 -P) + +# path to the ftime executable -- used for making a simple ftime.out file +# listing the name of the plotfile and its simulation time +FTIME_EXE=ftime.Linux.gfortran.exe + + +#---------------------------------------------------------------------------- +# initialization stuff + +# check to make sure that the lock file does not already exist. 
+if [ -f "$jobidfile" ]; then + # check if job is still running + existing_job=$(<"$jobidfile") + if [ "$(sacct -X -P -n -o State -j "$existing_job")" != RUNNING ]; then + echo "process: removing stale lock file for job $existing_job" + rm "$jobidfile" + else + echo "process job $existing_job is still running" + exit 2 + fi +fi + +# create the lock file +echo "$SLURM_JOB_ID" > "$jobidfile" + +# if our process is killed, remove the lock file first +function cleanup { + echo "process: received signal; removing $jobidfile" + command rm -f "$jobidfile" + # remove the EXIT handler, since we only want to do this once + trap - EXIT + # don't exit, so we can finish the current operation: + # $jobidfile is checked at the start of each loop iteration in process_files() +} +trap cleanup EXIT HUP INT QUIT TERM XCPU + +# Number of seconds to sleep before checking again. +N=60 + + +#---------------------------------------------------------------------------- +# make storage directories + +# once we process a file, we will move the plotfiles into the plotfiles/ +# directory. This then hides them from the script, so if the system +# later purges the files in the pltXXXXX directory and the .processed +# file, we don't overwrite our archived data with a tarred empty +# directory structure. We do the same with the checkpoint files (using +# checkfiles/) + +if [ ! -d plotfiles ]; then + mkdir plotfiles +fi + +if [ ! -d checkfiles ]; then + mkdir checkfiles +fi + + +#---------------------------------------------------------------------------- +# the processing function + +# Process Files. Once a plotfile is successfully processed, we will output +# a file pltXXXXX.processed (checkpoint files are only archived, with a +# chkXXXXX.processed file appearing once the archiving is successful). +# Subsequent invocations of this routine will skip over any plotfiles or +# checkpoint files that have a corresponding .processed file. + + +function process_files +{ + if [ ! 
-f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi + + + # plotfiles + + # Take all but the final plt file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well + # as pltXXXXX.processed files. We restrict the find command to a depth of + # 1 to avoid catching any already-processed files in the plotfiles/ + # directory + pltlist=($( + find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort + find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort + find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort + )) + + # Don't process the final plt file + unset "pltlist[-1]" + + for dir in "${pltlist[@]}" + do + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi + if [ -d ${dir} ]; then + + # only work on the file if there is not a .processed file in the + # main directory or the plotfiles/ directory + if [ ! -f ${dir}.processed ] && [ ! -f plotfiles/${dir}.processed ]; then + + # do processing + + # store the file on HPSS + ${HTAR} "${HTAR_ARGS[@]}" -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar + + # Ordinarily, we'd check htar's exit status (0 = successful), but + # on some machines (like Atlas) htar doesn't return a valid exit + # status. Instead we'll grep for the success line at the end of + # htar's output (which we piped into a file) and check the output + # status of grep + grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null + + # The variable $? holds the exit status of the previous command + if [ $? 
-eq 0 ]; then + + # mark this file as processed so we skip it next time + date > ${dir}.processed + + # output the plotfile name and simulation time to ftime.out + if [ `command -v ${FTIME_EXE}` ] ; then + ${FTIME_EXE} ${dir} >> ftime.out + fi + + # remove the htar temporary file + rm ${dir}.htar + + # move the plotfile into the plotfiles directory + mv ${dir} plotfiles/ + + # ..and the corresponding .processed file too. + mv ${dir}.processed plotfiles/ + + # and visualize it + #runtimevis.py plotfiles/${dir} + + fi + + fi # end test of whether plotfile already processed + + fi # end test of whether plotfile is a directory (as it should be) + + done + + + # checkpoint files + + # Take all but the final chk file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well + # as chkXXXXX.processed files. We restrict the find command to a depth of + # 1 to avoid catching any already-processed files in the checkfiles/ + # directory + chklist=($( + find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort + find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort + find . -maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort + )) + + # Don't process the final chk file + unset "chklist[-1]" + + for dir in "${chklist[@]}" + do + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi + if [ -d ${dir} ]; then + + if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then + + # store the file on HPSS + ${HTAR} "${HTAR_ARGS[@]}" -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar + + # Ordinarily, we'd check htar's exit status (0 = successful), but + # on some machines (like Atlas) htar doesn't return a valid exit + # status. 
Instead we'll grep for the success line at the end of + # htar's output (which we piped into a file) and check the output + # status of grep + grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null + + # The variable $? holds the exit status of the previous command + if [ $? -eq 0 ]; then + + # mark this file as processed so we skip it next time + date > ${dir}.processed + + # remove the htar temporary file + rm ${dir}.htar + + # move the checkpoint file into the checkfiles directory + mv ${dir} checkfiles/ + + # ..and the corresponding .processed file too. + mv ${dir}.processed checkfiles/ + + fi + + fi + + fi + done + +} + + +#---------------------------------------------------------------------------- +# the main function + +# archive any diagnostic files first -- give them a unique name, appending +# the date string, to make sure that we don't overwrite anything +datestr=$(date +"%Y%m%d_%H%M_%S") +all_files=($( + find . -maxdepth 1 -name "ftime.out" -print + find . -maxdepth 1 -name "inputs*" -print + find . -maxdepth 1 -name "*diag.out" -print + find . -maxdepth 1 -name "*.hse.*" -print + find . -maxdepth 1 -name "*.slurm" -print; find . -maxdepth 1 -name "*.submit" -print + find . -maxdepth 1 -name "process*" -print +)) + +${HTAR} -P -cvf ${HPSS_DIR}/diag_files_${datestr}.tar "${all_files[@]}" >> /dev/null + + +# Loop, waiting for plt and chk directories to appear. + +while true +do + process_files + # put sleep in the background so the shell can handle signals + sleep $N & + wait +done diff --git a/_sources/nersc-hpss.rst.txt b/_sources/nersc-hpss.rst.txt index 4be1df7..2784a67 100644 --- a/_sources/nersc-hpss.rst.txt +++ b/_sources/nersc-hpss.rst.txt @@ -9,41 +9,27 @@ frequently, since the scratch filesystems fill up and NERSC will purge data periodically. 
-The script ``nersc.xfer.slurm``: +The script ``nersc.xfer.slurm`` in ``job_scripts/perlmutter/``: :download:`nersc.xfer.slurm <../../job_scripts/perlmutter/nersc.xfer.slurm>` can be used to archive data to HPSS automatically. This is submitted to the xfer queue and runs the -script ``process.xrb``: +script ``process.xrb`` in ``job_scripts/hpss/``: -:download:`process.xrb <../../job_scripts/perlmutter/process.xrb>` +:download:`process.xrb <../../job_scripts/hpss/process.xrb>` -which continually looks for output and stores -it to HPSS. +which continually looks for output and stores it to HPSS. +By default, the destination directory on HPSS will have the same name +as the directory your plotfiles are located in. This can be changed by +editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``. The following describes how to use the scripts: -1. Create a directory in HPSS that has the same - name as the directory your plotfiles are located in - (just the directory name, not the full path). e.g. if you are running in a directory call - ``/pscratch/sd/z/zingale/wdconvect/`` run, then do: - - .. prompt:: bash - - hsi - mkdir wdconvect - - .. note:: - - If the ``hsi`` command prompts you for your password, you will need - to talk to the NERSC help desk to ask for password-less access to - HPSS. - -2. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm`` +#. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm`` into the directory with the plotfiles. -3. Submit the archive job: +#. Submit the archive job: .. prompt:: bash @@ -80,14 +66,16 @@ Some additional notes: the date-string to allow multiple archives to co-exist. * When ``process.xrb`` is running, it creates a lockfile (called - ``process.pid``) that ensures that only one instance of the script + ``process.jobid``) that ensures that only one instance of the script is running at any one time. .. 
warning:: Sometimes if the job is not terminated normally, the - ``process.pid`` file will be left behind, in which case, the script - aborts. Just delete that if you know the script is not running. + ``process.jobid`` file will be left behind. Later jobs should be + able to detect this and clean up the stale lockfile, but if this + doesn't work, you can delete the file if you know the script is not + running. Jobs in the xfer queue start up quickly. The best approach is to start one as you start your main job (or make it dependent on the main diff --git a/_sources/olcf-workflow.rst.txt b/_sources/olcf-workflow.rst.txt index 3b78ff6..ab114ab 100644 --- a/_sources/olcf-workflow.rst.txt +++ b/_sources/olcf-workflow.rst.txt @@ -383,27 +383,26 @@ where ``test_hpss.sh`` is a SLURM script that contains the ``htar`` commands needed to archive your data. This uses ``slurm`` as the job manager. -An example is provided by the ``process.xrb`` archiving script and -associated ``summit_hpss.submit`` submission script in -``jobs_scripts/summit/``. Together these will detect new plotfiles as -they are generated, tar them up (using ``htar``) and archive them onto -HPSS. They will also store the inputs, probin, and other runtime -generated files. If ``ftime`` is found in your path, it will also -create a file called ``ftime.out`` that lists the simulation time -corresponding to each plotfile. +An example is provided by the ``process.xrb`` archiving script in +``job_scripts/hpss/`` and associated ``summit_hpss.submit`` submission script +in ``jobs_scripts/summit/``. Together these will detect new plotfiles as they +are generated, tar them up (using ``htar``) and archive them onto HPSS. They +will also store the inputs, probin, and other runtime generated files. If +``ftime`` is found in your path, it will also create a file called +``ftime.out`` that lists the simulation time corresponding to each plotfile. 
Once the plotfiles are archived they are moved to a subdirectory under your run directory called ``plotfiles/``. +By default, the files will be archived to a directory in HPSS with the same +name as the directory your plotfiles are located in. This can be changed +by editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``. -To use this, we do the following: - -#. Enter the HPSS system via ``hsi`` -#. Create the output directory -- this should have the same name as the directory - you are running in on summit +To use this, we do the following: -#. Exit HPSS +#. Copy the ``process.xrb`` and ``summit_hpss.submit`` scripts into the + directory with the plotfiles. #. Launch the script via: @@ -411,7 +410,7 @@ To use this, we do the following: sbatch summit_hpss.submit - It will for the full time you asked, searching for plotfiles as + It will run for the full time you asked, searching for plotfiles as they are created and moving them to HPSS as they are produced (it will always leave the very last plotfile alone, since it can't tell if it is still being written). diff --git a/nersc-hpss.html b/nersc-hpss.html index 24f3654..b2cf3be 100644 --- a/nersc-hpss.html +++ b/nersc-hpss.html @@ -96,37 +96,26 @@

Archiving Data to HPSS -

The script nersc.xfer.slurm:

+

The script nersc.xfer.slurm in job_scripts/perlmutter/:

nersc.xfer.slurm

can be used to archive data to HPSS automatically. This is submitted to the xfer queue and runs the -script process.xrb:

-

process.xrb

-

which continually looks for output and stores -it to HPSS.

+script process.xrb in job_scripts/hpss/:

+

process.xrb

+

which continually looks for output and stores it to HPSS. +By default, the destination directory on HPSS will have the same name +as the directory your plotfiles are located in. This can be changed by +editing the $HPSS_DIR variable at the top of process.xrb.

The following describes how to use the scripts:

    -
  1. Create a directory in HPSS that has the same -name as the directory your plotfiles are located in -(just the directory name, not the full path). e.g. if you are running in a directory call -/pscratch/sd/z/zingale/wdconvect/ run, then do:

    +
  2. Copy the process.xrb script and the slurm script nersc.xfer.slurm +into the directory with the plotfiles.

  3. +
  4. Submit the archive job:

    hsi
    -mkdir wdconvect
    -
    -

    Note

    -

    If the hsi command prompts you for your password, you will need -to talk to the NERSC help desk to ask for password-less access to -HPSS.

    -
    -
  5. -
  6. Copy the process.xrb script and the slurm script nersc.xfer.slurm -into the directory with the plotfiles.

  7. -
  8. Submit the archive job:

    -
    sbatch nersc.xfer.slurm
    +sbatch nersc.xfer.slurm
     

    The script process.xrb is called from the xfer job and will run in the background and continually wait until checkpoint or plotfiles are created.

    @@ -155,13 +144,15 @@

    Archiving Data to HPSS.tar file is given a name that contains the date-string to allow multiple archives to co-exist.

  9. When process.xrb is running, it creates a lockfile (called -process.pid) that ensures that only one instance of the script +process.jobid) that ensures that only one instance of the script is running at any one time.

    Warning

    Sometimes if the job is not terminated normally, the -process.pid file will be left behind, in which case, the script -aborts. Just delete that if you know the script is not running.

    +process.jobid file will be left behind. Later jobs should be +able to detect this and clean up the stale lockfile, but if this +doesn’t work, you can delete the file if you know the script is not +running.

  10. diff --git a/olcf-workflow.html b/olcf-workflow.html index 1c83143..734fc6c 100644 --- a/olcf-workflow.html +++ b/olcf-workflow.html @@ -514,25 +514,25 @@

    Archiving to HPSStest_hpss.sh is a SLURM script that contains the htar commands needed to archive your data. This uses slurm as the job manager.

    -

    An example is provided by the process.xrb archiving script and -associated summit_hpss.submit submission script in -jobs_scripts/summit/. Together these will detect new plotfiles as -they are generated, tar them up (using htar) and archive them onto -HPSS. They will also store the inputs, probin, and other runtime -generated files. If ftime is found in your path, it will also -create a file called ftime.out that lists the simulation time -corresponding to each plotfile.

    +

    An example is provided by the process.xrb archiving script in +job_scripts/hpss/ and associated summit_hpss.submit submission script +in jobs_scripts/summit/. Together these will detect new plotfiles as they +are generated, tar them up (using htar) and archive them onto HPSS. They +will also store the inputs, probin, and other runtime generated files. If +ftime is found in your path, it will also create a file called +ftime.out that lists the simulation time corresponding to each plotfile.

    Once the plotfiles are archived they are moved to a subdirectory under your run directory called plotfiles/.

    +

    By default, the files will be archived to a directory in HPSS with the same +name as the directory your plotfiles are located in. This can be changed +by editing the $HPSS_DIR variable at the top of process.xrb.

    To use this, we do the following:

      -
    1. Enter the HPSS system via hsi

    2. -
    3. Create the output directory – this should have the same name as the directory -you are running in on summit

    4. -
    5. Exit HPSS

    6. +
    7. Copy the process.xrb and summit_hpss.submit scripts into the +directory with the plotfiles.

    8. Launch the script via:

      sbatch summit_hpss.submit
      -

      It will for the full time you asked, searching for plotfiles as +

      It will run for the full time you asked, searching for plotfiles as they are created and moving them to HPSS as they are produced (it will always leave the very last plotfile alone, since it can’t tell if it is still being written).

      diff --git a/searchindex.js b/searchindex.js index 9a4314f..23d460e 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["alcf", "iacs", "index", "nersc", "nersc-compilers", "nersc-hpss", "nersc-visualization", "nersc-workflow", "olcf", "olcf-andes", "olcf-compilers", "olcf-jupyter", "olcf-workflow", "workstations"], "filenames": ["alcf.rst", "iacs.rst", "index.rst", "nersc.rst", "nersc-compilers.rst", "nersc-hpss.rst", "nersc-visualization.rst", "nersc-workflow.rst", "olcf.rst", "olcf-andes.rst", "olcf-compilers.rst", "olcf-jupyter.rst", "olcf-workflow.rst", "workstations.rst"], "titles": ["Working at ALCF", "Working at IACS", "AMReX Astrophysics Suite", "Working at NERSC", "Compiling at NERSC", "Archiving Data to HPSS", "Visualization at NERSC", "Managing Jobs at NERSC", "Working at OLCF", "Batch Visualization on Andes", "Compiling at OLCF", "Running Jupyter Remotely from OLCF", "Managing Jobs at OLCF", "Linux Workstations"], "terms": {"polari": 0, "ha": [0, 1, 5, 7, 11], "560": 0, "node": [0, 1, 6, 7, 9, 12], "each": [0, 1, 7, 12], "4": [0, 1, 4, 7, 10, 12, 13], "nvidia": [0, 7], "a100": [0, 7], "gpu": [0, 2, 7, 9, 10, 12], "The": [0, 1, 5, 6, 7, 10, 12], "pb": 0, "schedul": [0, 12], "i": [0, 1, 5, 6, 7, 9, 10, 11, 12], "us": [0, 1, 4, 5, 7, 9, 10, 11, 12, 13], "ssh": [0, 1, 4, 13], "ornl": [0, 9, 10, 12], "gov": [0, 4, 6, 9, 10, 12], "To": [0, 1, 7, 11, 12, 13], "have": [0, 1, 7, 9, 10, 11, 12], "custom": 0, "bashrc": [0, 9], "creat": [0, 5, 6, 7, 8, 9, 12], "bash": [0, 1, 6, 7, 9, 12], "expert": 0, "file": [0, 1, 5, 7, 12], "add": [0, 1, 11, 12, 13], "anyth": 0, "thi": [0, 1, 5, 6, 7, 9, 10, 11, 12], "read": [0, 12], "end": [0, 7, 12], "etc": 0, "load": [0, 1, 4, 6, 7, 9, 10, 11, 12, 13], "modul": [0, 1, 4, 6, 7, 9, 10, 11, 12, 13], "swap": [0, 10], "prgenv": [0, 4, 7, 10, 12], "nvhpc": 0, "gnu": [0, 1, 4, 7, 10, 12, 13], "gcc": [0, 3, 10, 12, 13], "11": [0, 6, 10, 12, 13], "2": [0, 7, 9, 10, 12, 13], "0": [0, 1, 6, 
7, 10, 11, 12], "version": [0, 1, 10, 12, 13], "sinc": [0, 5, 7, 10, 12], "cuda": [0, 3, 7, 10, 12, 13], "doesn": [0, 10, 12], "39": 0, "t": [0, 5, 6, 7, 9, 10, 12], "support": [0, 1, 9, 10, 12], "12": [0, 1], "yet": 0, "mix": [0, 10, 12], "Then": [0, 9, 10, 12], "you": [0, 1, 4, 5, 6, 7, 9, 10, 11, 12, 13], "can": [0, 1, 5, 6, 7, 9, 11, 12, 13], "via": [0, 1, 4, 7, 9, 10, 11, 12], "make": [0, 1, 4, 5, 9, 10, 11, 12, 13], "comp": [0, 1, 4, 10, 13], "use_cuda": [0, 4, 10, 13], "true": [0, 1, 4, 10, 12, 13], "project": [0, 1, 7, 9], "workspac": 0, "lu": 0, "grand": 0, "astroexplos": 0, "http": [0, 1, 2, 6, 10, 12, 13], "www": [0, 1], "anl": 0, "user": 0, "guid": [0, 1, 12], "run": [0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13], "index": 0, "html": [0, 10, 12], "For": [0, 7, 9, 12], "product": [0, 12], "prod": 0, "smallest": [0, 12], "count": 0, "seem": [0, 1, 10], "allow": [0, 5, 7, 12], "10": [0, 10, 12, 13], "clone": [0, 9], "gettingstart": 0, "repo": [0, 9], "git": [0, 1, 9], "github": [0, 2, 4, 9, 12], "com": [0, 1, 2, 9, 12], "argonn": 0, "lcf": 0, "ll": [0, 9], "want": [0, 5, 7, 9, 12], "exampl": [0, 7, 10, 12], "affinity_gpu": 0, "particular": [0, 2, 6, 7, 12], "need": [0, 1, 5, 6, 7, 9, 10, 11, 12, 13], "script": [0, 5, 6, 7, 8, 9, 10], "set_affinity_gpu_polari": 0, "sh": [0, 7, 12], "copi": [0, 5], "your": [0, 5, 6, 7, 9, 10, 11, 12, 13], "directori": [0, 5, 7, 9, 12], "here": [0, 6, 7, 9, 12], "": [0, 7, 11, 12], "submiss": [0, 5, 6, 7, 9, 10, 12], "bin": [0, 6, 7, 9, 12], "l": [0, 7, 13], "select": [0, 7, 12], "system": [0, 7, 10, 12], "place": [0, 5], "scatter": 0, "walltim": [0, 12], "30": [0, 12], "00": [0, 6, 7, 9, 12], "q": [0, 6, 7, 12], "debug": [0, 7, 8], "A": [0, 6, 7, 9, 12], "exec": [0, 12], "castro2d": [0, 7, 12], "mpi": [0, 1, 4, 7, 12], "smplsdc": [0, 7], "ex": [0, 1, 7, 12], "input": [0, 1, 5, 7, 12], "inputs_2d": [0, 7], "n14": [0, 7], "coars": [0, 7], "enabl": [0, 4], "applic": [0, 12], "export": [0, 1, 6, 7, 12], "mpich_gpu_support_en": 0, "1": [0, 
1, 6, 7, 9, 12], "chang": [0, 7, 12], "cd": [0, 7, 9], "pbs_o_workdir": 0, "openmp": [0, 1, 7, 10, 12], "set": [0, 1, 7, 12], "nnode": [0, 12], "wc": 0, "pbs_nodefil": 0, "nranks_per_nod": 0, "ndepth": 0, "8": [0, 6, 7, 12], "nthread": 0, "ntotrank": 0, "mpiexec": [0, 1], "bind": [0, 7, 12], "rank": 0, "n": [0, 1, 6, 7, 9, 11, 12, 13], "ppn": 0, "depth": 0, "cpu": [0, 6, 7, 12], "env": [0, 9], "omp_num_thread": [0, 1, 6, 7, 12], "omp_plac": [0, 6, 7], "thread": [0, 1, 7, 12], "do": [0, 1, 5, 6, 7, 9, 11, 12, 13], "qsub": 0, "check": [0, 7, 12], "statu": [0, 7, 8], "qstat": 0, "u": [0, 7, 12], "usernam": [0, 7, 12], "from": [0, 4, 5, 7, 8, 9, 12], "last": [0, 7, 12], "checkpoint": [0, 5, 7, 12], "6": [0, 7, 12], "j": [0, 1, 4, 6, 7, 9, 10, 12, 13], "eo": 0, "function": [0, 7, 12], "find_chk_fil": [0, 7, 12], "take": [0, 1, 7, 12], "singl": [0, 7, 12], "argument": [0, 7, 12], "wildcard": [0, 7, 12], "pattern": [0, 7, 12], "look": [0, 5, 7, 12], "through": [0, 7, 12], "chk": [0, 7, 12], "find": [0, 1, 7, 10, 12], "latest": [0, 1, 7, 12], "wai": [0, 6, 7, 12], "didn": [0, 7, 12], "complet": [0, 7, 12], "we": [0, 1, 4, 7, 9, 12, 13], "fall": [0, 7, 12], "back": [0, 7, 12], "previou": [0, 7, 12], "one": [0, 5, 7, 12], "temp_fil": [0, 7, 12], "maxdepth": [0, 7, 12], "name": [0, 5, 6, 7, 12], "print": [0, 7, 12], "sort": [0, 7, 12], "tail": [0, 7, 12], "restartfil": [0, 7, 12], "f": [0, 7, 12], "header": [0, 7, 12, 13], "thing": [0, 7, 12], "written": [0, 5, 7, 12], "updat": [0, 7, 12], "fi": [0, 7, 12], "done": [0, 1, 5, 7, 12], "7": [0, 4, 7, 9, 10, 11, 12, 13], "digit": [0, 7, 12], "5": [0, 1, 7, 10, 11, 12], "restartstr": [0, 7, 12], "empti": [0, 7, 12], "ar": [0, 1, 4, 5, 7, 10, 12], "found": [0, 7, 12], "e": [0, 4, 5, 7, 9, 11, 12], "new": [0, 7, 9, 11, 12], "els": [0, 7, 12], "amr": [0, 1, 7, 12], "chainqsub": 0, "echo": [0, 7, 12], "usag": [0, 12], "jobid": [0, 7], "number": [0, 1, 7, 12], "initi": [0, 7, 12], "depend": [0, 5, 7, 12], "exit": [0, 7, 12], "3": [0, 
1, 4, 6, 7, 9, 10, 11, 12, 13], "oldjob": 0, "numjob": 0, "gt": 0, "20": [0, 7], "too": [0, 12], "mani": [0, 12], "request": [0, 8], "firstcount": 0, "eq": 0, "start": [0, 1, 5, 7, 12], "aout": 0, "sleep": 0, "seq": 0, "w": [0, 12], "afterani": 0, "48": 1, "comput": [1, 7, 12], "core": [1, 6, 7, 12], "group": 1, "pool": 1, "actual": [1, 5, 7], "13th": 1, "o": [1, 6, 7, 12], "stuff": 1, "so": [1, 7, 10, 12, 13], "an": [1, 7, 9, 10, 12, 13], "ideal": 1, "config": [1, 11], "would": [1, 7, 9, 12], "log": [1, 4, 10, 12], "login": 1, "stonybrook": 1, "edu": [1, 13], "tell": [1, 7, 12], "about": [1, 2, 12], "machin": [1, 2, 8], "put": [1, 12], "follow": [1, 5, 6, 7, 9, 12], "local": [1, 13], "tool": [1, 5], "gnumak": 1, "raw": 1, "githubusercont": 1, "astro": [1, 10, 12, 13], "workflow": [1, 12], "main": [1, 5, 12], "job_script": [1, 12], "onli": [1, 5, 7, 12], "access": [1, 5, 7, 10, 12], "environ": [1, 6, 8, 9, 10, 12], "note": [1, 5, 12], "srun": [1, 6, 7, 9, 12], "p": [1, 9, 12], "short": 1, "pty": 1, "interact": [1, 12], "slurm": [1, 5, 7, 9, 12], "job": [1, 3, 5, 8], "time": [1, 5, 7, 12], "out": [1, 5, 6, 7, 9, 12], "after": [1, 7, 12], "hour": [1, 12], "infinit": 1, "fj": 1, "debug1": 1, "debug2": 1, "them": [1, 5, 12], "There": [1, 7, 12, 13], "cce": 1, "sve": 1, "former": 1, "newer": 1, "llvm": 1, "base": [1, 7], "ocompil": 1, "fortran": 1, "doe": [1, 9, 10], "arm": 1, "architectur": [1, 8], "latter": 1, "older": [1, 13], "even": 1, "though": 1, "both": 1, "form": [1, 12], "x": [1, 12, 13], "thei": [1, 11, 12], "differ": [1, 7], "option": [1, 7, 10, 12], "see": [1, 7, 10, 12], "commcm": 1, "faq": 1, "get": [1, 12], "php": 1, "cpe": 1, "mvapich2_nogpu": 1, "should": [1, 4, 7, 10, 12], "test": [1, 4, 7, 10, 12], "mak": 1, "recogn": 1, "switch": 1, "old": 1, "flag": [1, 12], "build": [1, 2, 4, 10, 12, 13], "24": 1, "use_mpi": [1, 4, 10, 12, 13], "fals": [1, 4, 10, 12, 13], "long": [1, 5], "At": [1, 11], "moment": [1, 10], "link": 1, "cannot": 1, "nopattern": 1, 
"error": [1, 12], "which": [1, 5, 7, 9, 10, 11, 12], "why": 1, "comment": 1, "abov": [1, 12], "lustr": [1, 12], "global": 1, "softwar": 1, "a64fx": 1, "modulefil": 1, "mvapich2": 1, "use_omp": [1, 4, 10], "know": [1, 5], "chip": 1, "specif": [1, 9], "mv2_enable_affin": 1, "castro3d": [1, 12], "omp": [1, 7], "3d": [1, 12], "sph": [1, 12], "max_level": 1, "max_step": 1, "These": 2, "doc": [2, 6, 10, 12], "provid": [2, 12], "inform": [2, 12], "code": [2, 9, 10, 12, 13], "nyx": 2, "maestroex": 2, "castro": [2, 4, 7, 10, 12], "variou": 2, "includ": [2, 7, 12], "hpc": [2, 12], "center": 2, "workstat": 2, "gener": [2, 12, 13], "work": [2, 9, 10, 12, 13], "alcf": 2, "nersc": [2, 5], "olcf": [2, 9], "iac": 2, "linux": 2, "compil": [3, 7, 8, 12, 13], "perlmutt": 3, "hypr": 3, "manag": [3, 8], "filesystem": [3, 5, 12], "chain": [3, 8], "archiv": [3, 8], "data": [3, 12], "hpss": [3, 8], "visual": [3, 8, 13], "p1": 4, "cudatoolkit": [4, 7], "python": [4, 6, 9, 10, 11, 12], "requir": [4, 11, 12], "process": [4, 5, 7, 12], "g": [4, 5, 11, 12], "sedov": 4, "hydro": 4, "problem": [4, 7, 12], "tiny_profil": 4, "obtain": [4, 7], "built": 4, "same": [4, 5, 7, 10, 12], "hypre_cuda_sm": 4, "80": [4, 12], "cxx": 4, "cc": 4, "fc": 4, "ftn": 4, "configur": 4, "prefix": 4, "path": [4, 5, 12, 13], "instal": [4, 6, 9, 11, 12], "unifi": 4, "memori": [4, 7, 9, 10, 12], "larg": [5, 9], "tape": 5, "librari": [5, 7, 10, 11], "store": [5, 12], "simul": [5, 12], "period": 5, "It": [5, 9, 12], "recommend": [5, 10], "move": [5, 12], "frequent": 5, "scratch": [5, 7, 12], "fill": 5, "up": [5, 7, 12], "purg": 5, "xfer": 5, "automat": [5, 12], "submit": [5, 7, 8, 9], "queue": [5, 7, 12], "xrb": [5, 9, 12], "continu": [5, 13], "output": [5, 12, 13], "describ": [5, 11, 12], "how": [5, 12], "plotfil": [5, 9, 12], "locat": [5, 7], "just": [5, 12, 13], "full": [5, 12], "call": [5, 9, 12], "pscratch": 5, "sd": 5, "z": 5, "zingal": 5, "wdconvect": 5, "hsi": [5, 12], "mkdir": 5, "If": [5, 7, 9, 11, 12], 
"command": [5, 12, 13], "prompt": [5, 13], "password": [5, 13], "talk": 5, "help": [5, 12], "desk": 5, "ask": [5, 12], "less": 5, "sbatch": [5, 6, 7, 9, 12], "background": [5, 7, 12], "wait": [5, 7, 12], "until": [5, 7, 12], "alwai": [5, 12], "leav": [5, 12], "most": [5, 7], "recent": [5, 6, 12], "alon": [5, 12], "mai": [5, 7, 12], "still": [5, 12], "htar": [5, 12], "wa": [5, 12], "success": 5, "subdirectori": [5, 12], "import": [5, 10, 12], "don": 5, "try": [5, 12], "second": [5, 7, 12], "overwrit": 5, "especi": 5, "took": 5, "some": [5, 12], "addit": [5, 12], "ftime": [5, 12], "execut": [5, 7, 12], "cpp": 5, "live": 5, "amrex": [5, 10, 12], "list": [5, 12], "correspond": [5, 12], "right": 5, "when": [5, 9, 10, 11, 12], "tar": [5, 12], "all": [5, 7, 12], "diagnost": 5, "given": 5, "contain": [5, 7, 12], "date": 5, "string": 5, "multipl": [5, 7], "co": 5, "exist": [5, 7, 9, 12], "lockfil": 5, "pid": [5, 7], "ensur": [5, 12], "instanc": [5, 7], "ani": [5, 7, 12], "sometim": [5, 12], "termin": [5, 13], "normal": 5, "left": [5, 7, 12], "behind": 5, "case": [5, 7, 12], "abort": [5, 12], "delet": 5, "quickli": 5, "best": [5, 6, 7, 9, 10, 11, 12], "approach": 5, "sampl": [5, 6, 12], "produc": [5, 12], "yt": [6, 9, 11], "setup": [6, 9, 10, 12], "own": 6, "conda": [6, 8, 9], "step": [6, 7, 12], "develop": 6, "languag": 6, "someth": 6, "like": [6, 7, 12], "init": [6, 9], "myenv": 6, "activ": [6, 9], "more": [6, 7, 9, 12], "c": [6, 7, 9, 10, 11, 12], "forg": [6, 9, 11], "deactiv": 6, "m3018": [6, 7], "vi": 6, "vis_": 6, "01": 6, "ntask": [6, 7, 12], "per": [6, 7, 12], "regular": [6, 7], "omp_proc_bind": [6, 7], "spread": [6, 7], "massive_star_multi": 6, "py": [6, 7, 9, 12], "plt19862": 6, "1536": 7, "therefor": [7, 12], "task": [7, 12], "otherwis": [7, 10], "fail": 7, "runtim": [7, 10, 12], "becaus": [7, 12], "below": [7, 12], "16": 7, "also": [7, 10, 11, 12], "restart": [7, 12], "logic": 7, "m3018_g": 7, "subch_": 7, "map_gpu": 7, "signal": [7, 12], "b": [7, 12], "urg": [7, 
12], "castro_exec": 7, "clean": [7, 12], "over": [7, 12], "rm": [7, 12], "dump_and_stop": [7, 12], "send": [7, 12], "sigurg": [7, 12], "batch": [7, 8, 12], "minut": [7, 12], "befor": [7, 12], "limit": [7, 12], "gracefulli": [7, 12], "sig_handl": [7, 12], "touch": [7, 12], "disabl": [7, 12], "handler": [7, 12], "trap": [7, 12], "alloc": [7, 8], "soon": [7, 12], "dump": [7, 12], "stop": [7, 12], "workdir": 7, "basenam": 7, "slurm_submit_dir": [7, 9], "slack_job_start": 7, "michael": 7, "builtin": [7, 12], "shell": [7, 12], "handl": [7, 12], "64": 7, "ret": 7, "128": [7, 12], "23": 7, "receiv": [7, 12], "keep": [7, 12], "refer": 7, "distribut": [7, 11], "parallel": 7, "hyper": 7, "share": [7, 10], "socket": [7, 12], "256": [7, 12], "howev": [7, 12], "assign": 7, "physic": [7, 12], "detail": [7, 8], "instruct": 7, "within": 7, "perlmutter_script": 7, "account": 7, "qo": 7, "02": 7, "constraint": 7, "In": [7, 10, 11, 12, 13], "order": [7, 10, 11, 12], "coupl": 7, "design": [7, 12], "strategi": 7, "first": [7, 9, 11, 12], "fix": [7, 12], "next": [7, 12], "virtual": 7, "avail": [7, 12], "compos": [7, 12], "two": [7, 12], "where": [7, 12], "numa": 7, "domain": [7, 12], "lower": 7, "shortag": 7, "principl": 7, "squeez": 7, "resourc": [7, 12], "wall": 7, "clock": 7, "timestep": [7, 12], "grep": 7, "slurm_output": 7, "repeat": 7, "perfect": 7, "balanc": [7, 12], "reach": 7, "choic": 7, "compar": 7, "max_grid_s": 7, "optim": 7, "valu": [7, 12], "usual": [7, 10, 12], "half": 7, "level": [7, 9, 12], "half_siz": 7, "furthermor": 7, "sever": [7, 12], "blocking_factor": 7, "size": [7, 12], "final": [7, 9, 12], "increas": [7, 12], "scale": 7, "correctli": 7, "go": 7, "down": [7, 10], "factor": 7, "break": 7, "bigger": 7, "chainslurm": [7, 12], "mpich_max_thread_safeti": 7, "x86": [7, 12], "milan": 7, "inputs_fil": 7, "inputs_nova_t7": 7, "slurm_ntasks_per_nod": 7, "slurm_nnod": 7, "slurm_cpus_per_task": 7, "By": [7, 12], "default": [7, 10, 12, 13], "altern": [7, 12], "common": 7, 
"cf": 7, "everyon": 7, "squeue": [7, 12], "me": [7, 12], "estim": [7, 12], "cancel": 7, "scancel": 7, "tb": 7, "quota": 7, "showquota": 7, "finish": [7, 12], "origin": 7, "remain": [7, 12], "view": [7, 12], "id": [7, 12], "summit": [8, 11], "frontier": 8, "writ": 8, "monitor": 8, "templat": 8, "troubleshoot": 8, "jupyt": 8, "remot": 8, "Andes": 8, "andes": 9, "andes_env": 9, "anaconda": 9, "anaconda3": [9, 11], "modifi": [9, 12], "ad": 9, "y": [9, 11], "ipykernel": [9, 11], "nb_conda_kernel": [9, 11], "sourc": 9, "top": 9, "pip": 9, "uninstal": 9, "ast106": [9, 12], "plot": 9, "vol": 9, "enuc": 9, "flame_wave_1000hz_25cm_smallplt203204": 9, "veri": [9, 12], "might": 9, "solut": 9, "accomplish": 9, "xl": 10, "atleast": 10, "due": 10, "17": 10, "won": [10, 12], "present": 10, "warn": [10, 12], "packag": 10, "fine": 10, "sure": [10, 11, 12], "current": [10, 12], "disallow": 10, "line": [10, 12], "pair": 10, "offload": 10, "control": 10, "use_omp_offload": 10, "featur": [10, 12], "frontier_user_guid": [10, 12], "program": 10, "crayp": [10, 12], "accel": [10, 12], "amd": [10, 12], "gfx90a": [10, 12], "crai": [10, 12], "mpich": [10, 12], "rocm": [10, 12], "higher": 10, "issu": 10, "burner": 10, "tabul": 10, "rate": 10, "exhibit": 10, "strang": 10, "slow": 10, "without": [10, 12, 13], "use_hip": [10, 12], "jupyterhub": 11, "document": [11, 12], "extra": 11, "part": 11, "notebook": 11, "wish": 11, "correct": [11, 12], "point": [11, 12, 13], "good": [11, 12], "idea": [11, 12], "my_env": 11, "jupyterlab": 11, "subsequ": 11, "channel": 11, "search": [11, 12], "let": 12, "review": 12, "our": 12, "goal": 12, "necessari": 12, "insight": 12, "better": 12, "decis": 12, "construct": 12, "explain": 12, "expos": 12, "section": 12, "condens": 12, "replac": 12, "21": 12, "reserv": 12, "ram": 12, "bank": 12, "connect": [12, 13], "bu": 12, "commun": 12, "among": 12, "defin": 12, "whole": 12, "structur": 12, "depict": 12, "figur": 12, "extract": 12, "summit_user_guid": 12, "launcher": 12, 
"jsrun": 12, "minim": 12, "collect": 12, "certain": 12, "oper": 12, "extend": 12, "discuss": 12, "now": 12, "determin": 12, "maximum": 12, "fit": 12, "accord": 12, "summon": 12, "bsub": 12, "descript": 12, "perform": 12, "calcul": 12, "format": 12, "room": 12, "03": 12, "three": 12, "alloc_flag": 12, "smt4": 12, "consid": 12, "smt1": 12, "stand": 12, "interest": 12, "standard": 12, "stream": 12, "insid": 12, "similar": 12, "suppli": 12, "assum": 12, "between": 12, "small": 12, "smoothli": 12, "bug": [12, 13], "unix": 12, "mention": 12, "stdout_to_show": 12, "stderr_to_show": 12, "No": 12, "onc": 12, "grant": 12, "variabl": 12, "total": 12, "r": 12, "max": 12, "a1": 12, "c1": 12, "g1": 12, "r6": 12, "placehold": 12, "respect": 12, "match": 12, "box": 12, "grid": 12, "biggest": 12, "piec": 12, "32768": 12, "cell": 12, "100": 12, "131072": 12, "524288": 12, "32": 12, "2097152": 12, "7864320": 12, "93": 12, "75": 12, "480": 12, "30408704": 12, "90": 12, "625": 12, "assert": 12, "equival": 12, "impli": 12, "398": 12, "idl": 12, "sweep": 12, "entir": 12, "possibl": 12, "maxim": 12, "life": 12, "easier": 12, "instead": 12, "write": 12, "anoth": 12, "pack": 12, "statement": 12, "luna_script": 12, "luna_output": 12, "luna_sniffing_output": 12, "inputs_luna": 12, "n_re": 12, "n_cpu_cores_per_r": 12, "n_max_res_per_nod": 12, "n_mpi_per_r": 12, "n_gpu_per_r": 12, "downgrad": 12, "kill": 12, "As": 12, "week": 12, "month": 12, "mayb": 12, "year": 12, "come": 12, "salvat": 12, "mandatori": 12, "chkxxxxxxx": 12, "chkxxxxxx": 12, "chkxxxxx": 12, "implement": 12, "append": 12, "minimum": 12, "pick": 12, "amount": 12, "expir": 12, "pass": 12, "wt": 12, "cleanli": 12, "couldn": 12, "anywher": 12, "ignor": 12, "immedi": 12, "crash": 12, "upon": 12, "sigchld": 12, "sigwinch": 12, "least": 12, "trigger": 12, "other": 12, "event": 12, "launch": [12, 13], "must": 12, "jswait": 12, "save": 12, "bjob": 12, "slightli": 12, "nicer": 12, "jobstat": 12, "lead": 12, "enviro": 12, "simultan": 12, 
"n_res_1": 12, "n_res2": 12, "give": 12, "avoid": 12, "head": 12, "quiet": 12, "level_": 12, "dev": 12, "null": 12, "warning_tim": 12, "nohead": 12, "action_warning_tim": 12, "lsb_jobid": 12, "chain_submit": 12, "submit_script": 12, "return": 12, "transfer": 12, "15": 12, "cluster": 12, "dtn": 12, "test_hpss": 12, "associ": 12, "summit_hpss": 12, "jobs_script": 12, "togeth": 12, "detect": 12, "onto": 12, "probin": 12, "under": 12, "enter": [12, 13], "being": 12, "unarchiv": 12, "bulk": 12, "hpss_xfer": 12, "plt00000": 12, "hpss_dir": 12, "plotfile_dir": 12, "fetch": 12, "unpack": 12, "attempt": 12, "recov": 12, "titan": 12, "polici": 12, "orion": 12, "storag": 12, "05": 12, "closest": 12, "hip": 12, "trento": 12, "nmpi_per_nod": 12, "total_nmpi": 12, "slurm_job_num_nod": 12, "june": 12, "2023": 12, "explicitli": 12, "blob": 12, "warpx": 12, "readthedoc": 12, "io": 12, "en": 12, "queu": 12, "rocgdb": 12, "27": 12, "turn": 12, "startup": 12, "session": 12, "salloc": 12, "mz": 12, "restor": 12, "reload": 12, "hip_enable_deferred_load": 12, "amd_serialize_kernel": 12, "amd_serialize_copi": 12, "amd_log_level": 12, "lot": 12, "debugg": 12, "pagin": 12, "off": 12, "trace": 12, "interrupt": 12, "bt": 12, "workaround": 12, "prevent": 12, "hang": 12, "fi_mr_cache_monitor": 12, "memhook": 12, "report": 12, "arena": 12, "big": 12, "the_arena_init_s": 12, "grow": 12, "suggest": 12, "larger": 12, "than": 12, "well": 13, "nvcc": 13, "cuda_vers": 13, "cc60": 13, "compile_cuda_path": 13, "usr": 13, "no_device_launch": 13, "around": 13, "cc70": 13, "On": 13, "lab": 13, "browser": 13, "ip": 13, "localhost": 13, "8888": 13, "sunysb": 13, "window": 13, "web": 13, "token": 13, "appear": 13}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"work": [0, 1, 3, 8], "alcf": 0, "log": 0, "In": 0, "compil": [0, 1, 4, 10], "disk": 0, "queue": 0, "submit": [0, 12], "automat": 0, "restart": 0, "job": [0, 7, 12], "chain": [0, 7, 12], "iac": 1, "ookami": 1, "amrex": [1, 2], "setup": 
1, "crai": 1, "gcc": [1, 4], "10": 1, "2": 1, "astrophys": 2, "suit": 2, "astro": 2, "basic": 2, "nersc": [3, 4, 6, 7], "perlmutt": [4, 7], "cuda": 4, "hypr": 4, "archiv": [5, 12], "data": 5, "hpss": [5, 12], "visual": [6, 9], "manag": [7, 12], "filesystem": 7, "olcf": [8, 10, 11, 12], "batch": 9, "Andes": 9, "summit": [10, 12], "frontier": [10, 12], "run": 11, "jupyt": [11, 13], "remot": [11, 13], "from": 11, "creat": 11, "conda": 11, "environ": 11, "architectur": 12, "request": 12, "alloc": 12, "writ": 12, "script": 12, "monitor": 12, "templat": 12, "machin": 12, "detail": 12, "statu": 12, "debug": 12, "troubleshoot": 12, "linux": 13, "workstat": 13, "gpu": 13, "offload": 13, "bender": 13, "groot": 13, "vi": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 60}, "alltitles": {"Working at ALCF": [[0, "working-at-alcf"]], "Logging In": [[0, "logging-in"]], "Compiling": [[0, "compiling"]], "Disks": [[0, "disks"]], "Queues": [[0, "queues"]], "Submitting": [[0, "submitting"]], "Automatic Restarting": [[0, "automatic-restarting"]], "Job Chaining": [[0, "job-chaining"], [12, "job-chaining"]], "Working at IACS": [[1, "working-at-iacs"]], "Ookami": [[1, "ookami"]], "AMReX setup": [[1, "amrex-setup"]], "Cray compilers": [[1, "cray-compilers"]], "GCC": [[1, "gcc"]], "GCC 10.2": [[1, "gcc-10-2"]], "AMReX Astrophysics Suite": [[2, "amrex-astrophysics-suite"]], "AMReX Astro basics": [[2, null]], "Working at NERSC": [[3, "working-at-nersc"]], "Compiling at NERSC": [[4, "compiling-at-nersc"]], "Perlmutter": [[4, "perlmutter"], [7, "perlmutter"]], "Compiling with GCC + CUDA": [[4, "compiling-with-gcc-cuda"]], "Hypre": [[4, "hypre"]], "Archiving Data to HPSS": [[5, "archiving-data-to-hpss"]], "Visualization 
at NERSC": [[6, "visualization-at-nersc"]], "Managing Jobs at NERSC": [[7, "managing-jobs-at-nersc"]], "Filesystems": [[7, "filesystems"]], "Chaining": [[7, "chaining"]], "Working at OLCF": [[8, "working-at-olcf"]], "Batch Visualization on Andes": [[9, "batch-visualization-on-andes"]], "Compiling at OLCF": [[10, "compiling-at-olcf"]], "Summit": [[10, "summit"], [12, "summit"]], "Frontier": [[10, "frontier"], [12, "frontier"]], "Running Jupyter Remotely from OLCF": [[11, "running-jupyter-remotely-from-olcf"]], "Creating a conda environment": [[11, "creating-a-conda-environment"]], "Managing Jobs at OLCF": [[12, "managing-jobs-at-olcf"]], "Summit Architecture:": [[12, "summit-architecture"]], "Requesting Allocation:": [[12, "requesting-allocation"]], "Submitting a Job:": [[12, "submitting-a-job"]], "Writting a Job Script:": [[12, "writting-a-job-script"]], "Monitoring a Job:": [[12, "monitoring-a-job"]], "Script Template:": [[12, "script-template"]], "Chaining jobs": [[12, "chaining-jobs"]], "Archiving to HPSS": [[12, "archiving-to-hpss"]], "Machine details": [[12, "machine-details"]], "Submitting jobs": [[12, "submitting-jobs"]], "Job Status": [[12, "job-status"]], "Debugging": [[12, "debugging"]], "Troubleshooting": [[12, "troubleshooting"]], "Linux Workstations": [[13, "linux-workstations"]], "GPU offloading": [[13, "gpu-offloading"]], "bender": [[13, "bender"]], "groot": [[13, "groot"]], "Remote vis with Jupyter": [[13, "remote-vis-with-jupyter"]]}, "indexentries": {}}) \ No newline at end of file +Search.setIndex({"docnames": ["alcf", "iacs", "index", "nersc", "nersc-compilers", "nersc-hpss", "nersc-visualization", "nersc-workflow", "olcf", "olcf-andes", "olcf-compilers", "olcf-jupyter", "olcf-workflow", "workstations"], "filenames": ["alcf.rst", "iacs.rst", "index.rst", "nersc.rst", "nersc-compilers.rst", "nersc-hpss.rst", "nersc-visualization.rst", "nersc-workflow.rst", "olcf.rst", "olcf-andes.rst", "olcf-compilers.rst", "olcf-jupyter.rst", 
"olcf-workflow.rst", "workstations.rst"], "titles": ["Working at ALCF", "Working at IACS", "AMReX Astrophysics Suite", "Working at NERSC", "Compiling at NERSC", "Archiving Data to HPSS", "Visualization at NERSC", "Managing Jobs at NERSC", "Working at OLCF", "Batch Visualization on Andes", "Compiling at OLCF", "Running Jupyter Remotely from OLCF", "Managing Jobs at OLCF", "Linux Workstations"], "terms": {"polari": 0, "ha": [0, 1, 7, 11], "560": 0, "node": [0, 1, 6, 7, 9, 12], "each": [0, 1, 7, 12], "4": [0, 1, 4, 7, 10, 12, 13], "nvidia": [0, 7], "a100": [0, 7], "gpu": [0, 2, 7, 9, 10, 12], "The": [0, 1, 5, 6, 7, 10, 12], "pb": 0, "schedul": [0, 12], "i": [0, 1, 5, 6, 7, 9, 10, 11, 12], "us": [0, 1, 4, 5, 7, 9, 10, 11, 12, 13], "ssh": [0, 1, 4, 13], "ornl": [0, 9, 10, 12], "gov": [0, 4, 6, 9, 10, 12], "To": [0, 1, 7, 11, 12, 13], "have": [0, 1, 5, 7, 9, 10, 11, 12], "custom": 0, "bashrc": [0, 9], "creat": [0, 5, 6, 7, 8, 9, 12], "bash": [0, 1, 6, 7, 9, 12], "expert": 0, "file": [0, 1, 5, 7, 12], "add": [0, 1, 11, 12, 13], "anyth": 0, "thi": [0, 1, 5, 6, 7, 9, 10, 11, 12], "read": [0, 12], "end": [0, 7, 12], "etc": 0, "load": [0, 1, 4, 6, 7, 9, 10, 11, 12, 13], "modul": [0, 1, 4, 6, 7, 9, 10, 11, 12, 13], "swap": [0, 10], "prgenv": [0, 4, 7, 10, 12], "nvhpc": 0, "gnu": [0, 1, 4, 7, 10, 12, 13], "gcc": [0, 3, 10, 12, 13], "11": [0, 6, 10, 12, 13], "2": [0, 7, 9, 10, 12, 13], "0": [0, 1, 6, 7, 10, 11, 12], "version": [0, 1, 10, 12, 13], "sinc": [0, 5, 7, 10, 12], "cuda": [0, 3, 7, 10, 12, 13], "doesn": [0, 5, 10, 12], "39": 0, "t": [0, 5, 6, 7, 9, 10, 12], "support": [0, 1, 9, 10, 12], "12": [0, 1], "yet": 0, "mix": [0, 10, 12], "Then": [0, 9, 10, 12], "you": [0, 1, 4, 5, 6, 7, 9, 10, 11, 12, 13], "can": [0, 1, 5, 6, 7, 9, 11, 12, 13], "via": [0, 1, 4, 7, 9, 10, 11, 12], "make": [0, 1, 4, 5, 9, 10, 11, 12, 13], "comp": [0, 1, 4, 10, 13], "use_cuda": [0, 4, 10, 13], "true": [0, 1, 4, 10, 12, 13], "project": [0, 1, 7, 9], "workspac": 0, "lu": 0, "grand": 0, 
"astroexplos": 0, "http": [0, 1, 2, 6, 10, 12, 13], "www": [0, 1], "anl": 0, "user": 0, "guid": [0, 1, 12], "run": [0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13], "index": 0, "html": [0, 10, 12], "For": [0, 7, 9, 12], "product": [0, 12], "prod": 0, "smallest": [0, 12], "count": 0, "seem": [0, 1, 10], "allow": [0, 5, 7, 12], "10": [0, 10, 12, 13], "clone": [0, 9], "gettingstart": 0, "repo": [0, 9], "git": [0, 1, 9], "github": [0, 2, 4, 9, 12], "com": [0, 1, 2, 9, 12], "argonn": 0, "lcf": 0, "ll": [0, 9], "want": [0, 5, 7, 9, 12], "exampl": [0, 7, 10, 12], "affinity_gpu": 0, "particular": [0, 2, 6, 7, 12], "need": [0, 1, 6, 7, 9, 10, 11, 12, 13], "script": [0, 5, 6, 7, 8, 9, 10], "set_affinity_gpu_polari": 0, "sh": [0, 7, 12], "copi": [0, 5, 12], "your": [0, 5, 6, 7, 9, 10, 11, 12, 13], "directori": [0, 5, 7, 9, 12], "here": [0, 6, 7, 9, 12], "": [0, 7, 11, 12], "submiss": [0, 5, 6, 7, 9, 10, 12], "bin": [0, 6, 7, 9, 12], "l": [0, 7, 13], "select": [0, 7, 12], "system": [0, 7, 10, 12], "place": [0, 5], "scatter": 0, "walltim": [0, 12], "30": [0, 12], "00": [0, 6, 7, 9, 12], "q": [0, 6, 7, 12], "debug": [0, 7, 8], "A": [0, 6, 7, 9, 12], "exec": [0, 12], "castro2d": [0, 7, 12], "mpi": [0, 1, 4, 7, 12], "smplsdc": [0, 7], "ex": [0, 1, 7, 12], "input": [0, 1, 5, 7, 12], "inputs_2d": [0, 7], "n14": [0, 7], "coars": [0, 7], "enabl": [0, 4], "applic": [0, 12], "export": [0, 1, 6, 7, 12], "mpich_gpu_support_en": 0, "1": [0, 1, 6, 7, 9, 12], "chang": [0, 5, 7, 12], "cd": [0, 7, 9], "pbs_o_workdir": 0, "openmp": [0, 1, 7, 10, 12], "set": [0, 1, 7, 12], "nnode": [0, 12], "wc": 0, "pbs_nodefil": 0, "nranks_per_nod": 0, "ndepth": 0, "8": [0, 6, 7, 12], "nthread": 0, "ntotrank": 0, "mpiexec": [0, 1], "bind": [0, 7, 12], "rank": 0, "n": [0, 1, 6, 7, 9, 11, 12, 13], "ppn": 0, "depth": 0, "cpu": [0, 6, 7, 12], "env": [0, 9], "omp_num_thread": [0, 1, 6, 7, 12], "omp_plac": [0, 6, 7], "thread": [0, 1, 7, 12], "do": [0, 1, 6, 7, 9, 11, 12, 13], "qsub": 0, "check": [0, 7, 12], "statu": [0, 7, 8], 
"qstat": 0, "u": [0, 7, 12], "usernam": [0, 7, 12], "from": [0, 4, 5, 7, 8, 9, 12], "last": [0, 7, 12], "checkpoint": [0, 5, 7, 12], "6": [0, 7, 12], "j": [0, 1, 4, 6, 7, 9, 10, 12, 13], "eo": 0, "function": [0, 7, 12], "find_chk_fil": [0, 7, 12], "take": [0, 1, 7, 12], "singl": [0, 7, 12], "argument": [0, 7, 12], "wildcard": [0, 7, 12], "pattern": [0, 7, 12], "look": [0, 5, 7, 12], "through": [0, 7, 12], "chk": [0, 7, 12], "find": [0, 1, 7, 10, 12], "latest": [0, 1, 7, 12], "wai": [0, 6, 7, 12], "didn": [0, 7, 12], "complet": [0, 7, 12], "we": [0, 1, 4, 7, 9, 12, 13], "fall": [0, 7, 12], "back": [0, 7, 12], "previou": [0, 7, 12], "one": [0, 5, 7, 12], "temp_fil": [0, 7, 12], "maxdepth": [0, 7, 12], "name": [0, 5, 6, 7, 12], "print": [0, 7, 12], "sort": [0, 7, 12], "tail": [0, 7, 12], "restartfil": [0, 7, 12], "f": [0, 7, 12], "header": [0, 7, 12, 13], "thing": [0, 7, 12], "written": [0, 5, 7, 12], "updat": [0, 7, 12], "fi": [0, 7, 12], "done": [0, 1, 5, 7, 12], "7": [0, 4, 7, 9, 10, 11, 12, 13], "digit": [0, 7, 12], "5": [0, 1, 7, 10, 11, 12], "restartstr": [0, 7, 12], "empti": [0, 7, 12], "ar": [0, 1, 4, 5, 7, 10, 12], "found": [0, 7, 12], "e": [0, 4, 7, 9, 11, 12], "new": [0, 7, 9, 11, 12], "els": [0, 7, 12], "amr": [0, 1, 7, 12], "chainqsub": 0, "echo": [0, 7, 12], "usag": [0, 12], "jobid": [0, 5, 7], "number": [0, 1, 7, 12], "initi": [0, 7, 12], "depend": [0, 5, 7, 12], "exit": [0, 7, 12], "3": [0, 1, 4, 6, 7, 9, 10, 11, 12, 13], "oldjob": 0, "numjob": 0, "gt": 0, "20": [0, 7], "too": [0, 12], "mani": [0, 12], "request": [0, 8], "firstcount": 0, "eq": 0, "start": [0, 1, 5, 7, 12], "aout": 0, "sleep": 0, "seq": 0, "w": [0, 12], "afterani": 0, "48": 1, "comput": [1, 7, 12], "core": [1, 6, 7, 12], "group": 1, "pool": 1, "actual": [1, 5, 7], "13th": 1, "o": [1, 6, 7, 12], "stuff": 1, "so": [1, 7, 10, 12, 13], "an": [1, 7, 9, 10, 12, 13], "ideal": 1, "config": [1, 11], "would": [1, 7, 9, 12], "log": [1, 4, 10, 12], "login": 1, "stonybrook": 1, "edu": [1, 13], 
"tell": [1, 7, 12], "about": [1, 2, 12], "machin": [1, 2, 8], "put": [1, 12], "follow": [1, 5, 6, 7, 9, 12], "local": [1, 13], "tool": [1, 5], "gnumak": 1, "raw": 1, "githubusercont": 1, "astro": [1, 10, 12, 13], "workflow": [1, 12], "main": [1, 5, 12], "job_script": [1, 5, 12], "onli": [1, 5, 7, 12], "access": [1, 7, 10, 12], "environ": [1, 6, 8, 9, 10, 12], "note": [1, 5, 12], "srun": [1, 6, 7, 9, 12], "p": [1, 9, 12], "short": 1, "pty": 1, "interact": [1, 12], "slurm": [1, 5, 7, 9, 12], "job": [1, 3, 5, 8], "time": [1, 5, 7, 12], "out": [1, 5, 6, 7, 9, 12], "after": [1, 7, 12], "hour": [1, 12], "infinit": 1, "fj": 1, "debug1": 1, "debug2": 1, "them": [1, 5, 12], "There": [1, 7, 12, 13], "cce": 1, "sve": 1, "former": 1, "newer": 1, "llvm": 1, "base": [1, 7], "ocompil": 1, "fortran": 1, "doe": [1, 9, 10], "arm": 1, "architectur": [1, 8], "latter": 1, "older": [1, 13], "even": 1, "though": 1, "both": 1, "form": [1, 12], "x": [1, 12, 13], "thei": [1, 11, 12], "differ": [1, 7], "option": [1, 7, 10, 12], "see": [1, 7, 10, 12], "commcm": 1, "faq": 1, "get": [1, 12], "php": 1, "cpe": 1, "mvapich2_nogpu": 1, "should": [1, 4, 5, 7, 10, 12], "test": [1, 4, 7, 10, 12], "mak": 1, "recogn": 1, "switch": 1, "old": 1, "flag": [1, 12], "build": [1, 2, 4, 10, 12, 13], "24": 1, "use_mpi": [1, 4, 10, 12, 13], "fals": [1, 4, 10, 12, 13], "long": [1, 5], "At": [1, 11], "moment": [1, 10], "link": 1, "cannot": 1, "nopattern": 1, "error": [1, 12], "which": [1, 5, 7, 9, 10, 11, 12], "why": 1, "comment": 1, "abov": [1, 12], "lustr": [1, 12], "global": 1, "softwar": 1, "a64fx": 1, "modulefil": 1, "mvapich2": 1, "use_omp": [1, 4, 10], "know": [1, 5], "chip": 1, "specif": [1, 9], "mv2_enable_affin": 1, "castro3d": [1, 12], "omp": [1, 7], "3d": [1, 12], "sph": [1, 12], "max_level": 1, "max_step": 1, "These": 2, "doc": [2, 6, 10, 12], "provid": [2, 12], "inform": [2, 12], "code": [2, 9, 10, 12, 13], "nyx": 2, "maestroex": 2, "castro": [2, 4, 7, 10, 12], "variou": 2, "includ": [2, 7, 12], 
"hpc": [2, 12], "center": 2, "workstat": 2, "gener": [2, 12, 13], "work": [2, 5, 9, 10, 12, 13], "alcf": 2, "nersc": [2, 5], "olcf": [2, 9], "iac": 2, "linux": 2, "compil": [3, 7, 8, 12, 13], "perlmutt": [3, 5], "hypr": 3, "manag": [3, 8], "filesystem": [3, 5, 12], "chain": [3, 8], "archiv": [3, 8], "data": [3, 12], "hpss": [3, 8], "visual": [3, 8, 13], "p1": 4, "cudatoolkit": [4, 7], "python": [4, 6, 9, 10, 11, 12], "requir": [4, 11, 12], "process": [4, 5, 7, 12], "g": [4, 11, 12], "sedov": 4, "hydro": 4, "problem": [4, 7, 12], "tiny_profil": 4, "obtain": [4, 7], "built": 4, "same": [4, 5, 7, 10, 12], "hypre_cuda_sm": 4, "80": [4, 12], "cxx": 4, "cc": 4, "fc": 4, "ftn": 4, "configur": 4, "prefix": 4, "path": [4, 5, 12, 13], "instal": [4, 6, 9, 11, 12], "unifi": 4, "memori": [4, 7, 9, 10, 12], "larg": [5, 9], "tape": 5, "librari": [5, 7, 10, 11], "store": [5, 12], "simul": [5, 12], "period": 5, "It": [5, 9, 12], "recommend": [5, 10], "move": [5, 12], "frequent": 5, "scratch": [5, 7, 12], "fill": 5, "up": [5, 7, 12], "purg": 5, "xfer": 5, "automat": [5, 12], "submit": [5, 7, 8, 9], "queue": [5, 7, 12], "xrb": [5, 9, 12], "continu": [5, 13], "output": [5, 12, 13], "By": [5, 7, 12], "default": [5, 7, 10, 12, 13], "destin": 5, "plotfil": [5, 9, 12], "locat": [5, 7, 12], "edit": [5, 12], "hpss_dir": [5, 12], "variabl": [5, 12], "top": [5, 9, 12], "describ": [5, 11, 12], "how": [5, 12], "sbatch": [5, 6, 7, 9, 12], "call": [5, 9, 12], "background": [5, 7, 12], "wait": [5, 7, 12], "until": [5, 7, 12], "alwai": [5, 12], "leav": [5, 12], "most": [5, 7], "recent": [5, 6, 12], "alon": [5, 12], "mai": [5, 7, 12], "still": [5, 12], "htar": [5, 12], "If": [5, 7, 9, 11, 12], "command": [5, 12, 13], "wa": [5, 12], "success": 5, "subdirectori": [5, 12], "import": [5, 10, 12], "don": 5, "try": [5, 12], "second": [5, 7, 12], "overwrit": 5, "especi": 5, "took": 5, "some": [5, 12], "addit": [5, 12], "ftime": [5, 12], "execut": [5, 7, 12], "cpp": 5, "live": 5, "amrex": [5, 10, 12], 
"list": [5, 12], "correspond": [5, 12], "right": 5, "when": [5, 9, 10, 11, 12], "tar": [5, 12], "all": [5, 7, 12], "diagnost": 5, "given": 5, "contain": [5, 7, 12], "date": 5, "string": 5, "multipl": [5, 7], "co": 5, "exist": [5, 7, 9, 12], "lockfil": 5, "ensur": [5, 12], "instanc": [5, 7], "ani": [5, 7, 12], "sometim": [5, 12], "termin": [5, 13], "normal": 5, "left": [5, 7, 12], "behind": 5, "later": 5, "abl": 5, "detect": [5, 12], "clean": [5, 7, 12], "stale": 5, "delet": 5, "quickli": 5, "best": [5, 6, 7, 9, 10, 11, 12], "approach": 5, "sampl": [5, 6, 12], "produc": [5, 12], "yt": [6, 9, 11], "setup": [6, 9, 10, 12], "own": 6, "conda": [6, 8, 9], "step": [6, 7, 12], "develop": 6, "languag": 6, "someth": 6, "like": [6, 7, 12], "init": [6, 9], "myenv": 6, "activ": [6, 9], "more": [6, 7, 9, 12], "c": [6, 7, 9, 10, 11, 12], "forg": [6, 9, 11], "deactiv": 6, "m3018": [6, 7], "vi": 6, "vis_": 6, "01": 6, "ntask": [6, 7, 12], "per": [6, 7, 12], "regular": [6, 7], "omp_proc_bind": [6, 7], "spread": [6, 7], "massive_star_multi": 6, "py": [6, 7, 9, 12], "plt19862": 6, "1536": 7, "therefor": [7, 12], "task": [7, 12], "otherwis": [7, 10], "fail": 7, "runtim": [7, 10, 12], "becaus": [7, 12], "below": [7, 12], "16": 7, "also": [7, 10, 11, 12], "restart": [7, 12], "logic": 7, "m3018_g": 7, "subch_": 7, "map_gpu": 7, "signal": [7, 12], "b": [7, 12], "urg": [7, 12], "castro_exec": 7, "over": [7, 12], "rm": [7, 12], "dump_and_stop": [7, 12], "send": [7, 12], "sigurg": [7, 12], "batch": [7, 8, 12], "minut": [7, 12], "befor": [7, 12], "limit": [7, 12], "gracefulli": [7, 12], "sig_handl": [7, 12], "touch": [7, 12], "disabl": [7, 12], "handler": [7, 12], "trap": [7, 12], "alloc": [7, 8], "soon": [7, 12], "dump": [7, 12], "stop": [7, 12], "workdir": 7, "basenam": 7, "slurm_submit_dir": [7, 9], "slack_job_start": 7, "michael": 7, "builtin": [7, 12], "shell": [7, 12], "handl": [7, 12], "64": 7, "pid": 7, "ret": 7, "128": [7, 12], "23": 7, "receiv": [7, 12], "keep": [7, 12], "refer": 7, 
"distribut": [7, 11], "parallel": 7, "hyper": 7, "share": [7, 10], "socket": [7, 12], "256": [7, 12], "howev": [7, 12], "assign": 7, "physic": [7, 12], "detail": [7, 8], "instruct": 7, "within": 7, "perlmutter_script": 7, "account": 7, "qo": 7, "02": 7, "constraint": 7, "In": [7, 10, 11, 12, 13], "order": [7, 10, 11, 12], "coupl": 7, "design": [7, 12], "strategi": 7, "first": [7, 9, 11, 12], "fix": [7, 12], "next": [7, 12], "virtual": 7, "avail": [7, 12], "compos": [7, 12], "two": [7, 12], "where": [7, 12], "numa": 7, "domain": [7, 12], "lower": 7, "case": [7, 12], "shortag": 7, "principl": 7, "squeez": 7, "resourc": [7, 12], "wall": 7, "clock": 7, "timestep": [7, 12], "grep": 7, "slurm_output": 7, "repeat": 7, "perfect": 7, "balanc": [7, 12], "reach": 7, "choic": 7, "compar": 7, "max_grid_s": 7, "optim": 7, "valu": [7, 12], "usual": [7, 10, 12], "half": 7, "level": [7, 9, 12], "half_siz": 7, "furthermor": 7, "sever": [7, 12], "blocking_factor": 7, "size": [7, 12], "final": [7, 9, 12], "increas": [7, 12], "scale": 7, "correctli": 7, "go": 7, "down": [7, 10], "factor": 7, "break": 7, "bigger": 7, "chainslurm": [7, 12], "mpich_max_thread_safeti": 7, "x86": [7, 12], "milan": 7, "inputs_fil": 7, "inputs_nova_t7": 7, "slurm_ntasks_per_nod": 7, "slurm_nnod": 7, "slurm_cpus_per_task": 7, "altern": [7, 12], "common": 7, "cf": 7, "everyon": 7, "squeue": [7, 12], "me": [7, 12], "estim": [7, 12], "cancel": 7, "scancel": 7, "tb": 7, "quota": 7, "showquota": 7, "finish": [7, 12], "origin": 7, "remain": [7, 12], "view": [7, 12], "id": [7, 12], "summit": [8, 11], "frontier": 8, "writ": 8, "monitor": 8, "templat": 8, "troubleshoot": 8, "jupyt": 8, "remot": 8, "Andes": 8, "andes": 9, "andes_env": 9, "anaconda": 9, "anaconda3": [9, 11], "modifi": [9, 12], "ad": 9, "y": [9, 11], "ipykernel": [9, 11], "nb_conda_kernel": [9, 11], "sourc": 9, "pip": 9, "uninstal": 9, "ast106": [9, 12], "plot": 9, "vol": 9, "enuc": 9, "flame_wave_1000hz_25cm_smallplt203204": 9, "veri": [9, 12], "might": 
9, "solut": 9, "accomplish": 9, "xl": 10, "atleast": 10, "due": 10, "17": 10, "won": [10, 12], "present": 10, "warn": [10, 12], "packag": 10, "fine": 10, "sure": [10, 11, 12], "current": [10, 12], "disallow": 10, "line": [10, 12], "pair": 10, "offload": 10, "control": 10, "use_omp_offload": 10, "featur": [10, 12], "frontier_user_guid": [10, 12], "program": 10, "crayp": [10, 12], "accel": [10, 12], "amd": [10, 12], "gfx90a": [10, 12], "crai": [10, 12], "mpich": [10, 12], "rocm": [10, 12], "higher": 10, "issu": 10, "burner": 10, "tabul": 10, "rate": 10, "exhibit": 10, "strang": 10, "slow": 10, "without": [10, 12, 13], "use_hip": [10, 12], "jupyterhub": 11, "document": [11, 12], "extra": 11, "part": 11, "notebook": 11, "wish": 11, "correct": [11, 12], "point": [11, 12, 13], "good": [11, 12], "idea": [11, 12], "my_env": 11, "jupyterlab": 11, "subsequ": 11, "channel": 11, "search": [11, 12], "let": 12, "review": 12, "our": 12, "goal": 12, "necessari": 12, "insight": 12, "better": 12, "decis": 12, "construct": 12, "explain": 12, "expos": 12, "section": 12, "condens": 12, "replac": 12, "21": 12, "reserv": 12, "ram": 12, "bank": 12, "connect": [12, 13], "bu": 12, "commun": 12, "among": 12, "defin": 12, "whole": 12, "structur": 12, "depict": 12, "figur": 12, "extract": 12, "summit_user_guid": 12, "launcher": 12, "jsrun": 12, "minim": 12, "collect": 12, "certain": 12, "oper": 12, "extend": 12, "discuss": 12, "now": 12, "just": [12, 13], "determin": 12, "maximum": 12, "fit": 12, "accord": 12, "summon": 12, "bsub": 12, "descript": 12, "perform": 12, "calcul": 12, "format": 12, "room": 12, "03": 12, "three": 12, "alloc_flag": 12, "smt4": 12, "consid": 12, "smt1": 12, "stand": 12, "interest": 12, "standard": 12, "stream": 12, "insid": 12, "similar": 12, "suppli": 12, "assum": 12, "between": 12, "small": 12, "smoothli": 12, "bug": [12, 13], "unix": 12, "mention": 12, "stdout_to_show": 12, "stderr_to_show": 12, "No": 12, "onc": 12, "grant": 12, "total": 12, "r": 12, "max": 12, 
"a1": 12, "c1": 12, "g1": 12, "r6": 12, "placehold": 12, "respect": 12, "match": 12, "box": 12, "grid": 12, "biggest": 12, "piec": 12, "32768": 12, "cell": 12, "100": 12, "131072": 12, "524288": 12, "32": 12, "2097152": 12, "7864320": 12, "93": 12, "75": 12, "480": 12, "30408704": 12, "90": 12, "625": 12, "assert": 12, "equival": 12, "impli": 12, "398": 12, "idl": 12, "sweep": 12, "entir": 12, "possibl": 12, "maxim": 12, "life": 12, "easier": 12, "instead": 12, "write": 12, "anoth": 12, "pack": 12, "statement": 12, "luna_script": 12, "luna_output": 12, "luna_sniffing_output": 12, "inputs_luna": 12, "n_re": 12, "n_cpu_cores_per_r": 12, "n_max_res_per_nod": 12, "n_mpi_per_r": 12, "n_gpu_per_r": 12, "downgrad": 12, "kill": 12, "As": 12, "week": 12, "month": 12, "mayb": 12, "year": 12, "come": 12, "salvat": 12, "mandatori": 12, "chkxxxxxxx": 12, "chkxxxxxx": 12, "chkxxxxx": 12, "implement": 12, "append": 12, "minimum": 12, "pick": 12, "ask": 12, "amount": 12, "expir": 12, "pass": 12, "wt": 12, "cleanli": 12, "couldn": 12, "anywher": 12, "ignor": 12, "immedi": 12, "crash": 12, "upon": 12, "sigchld": 12, "sigwinch": 12, "least": 12, "trigger": 12, "other": 12, "event": 12, "launch": [12, 13], "must": 12, "jswait": 12, "save": 12, "bjob": 12, "slightli": 12, "nicer": 12, "jobstat": 12, "lead": 12, "enviro": 12, "simultan": 12, "n_res_1": 12, "n_res2": 12, "give": 12, "avoid": 12, "head": 12, "quiet": 12, "level_": 12, "dev": 12, "null": 12, "warning_tim": 12, "nohead": 12, "action_warning_tim": 12, "lsb_jobid": 12, "chain_submit": 12, "submit_script": 12, "return": 12, "transfer": 12, "15": 12, "cluster": 12, "dtn": 12, "test_hpss": 12, "associ": 12, "summit_hpss": 12, "jobs_script": 12, "togeth": 12, "onto": 12, "probin": 12, "under": 12, "full": 12, "being": 12, "unarchiv": 12, "bulk": 12, "hpss_xfer": 12, "plt00000": 12, "plotfile_dir": 12, "fetch": 12, "unpack": 12, "attempt": 12, "recov": 12, "titan": 12, "help": 12, "polici": 12, "orion": 12, "storag": 12, "05": 12, 
"closest": 12, "hip": 12, "trento": 12, "nmpi_per_nod": 12, "total_nmpi": 12, "slurm_job_num_nod": 12, "june": 12, "2023": 12, "explicitli": 12, "blob": 12, "warpx": 12, "readthedoc": 12, "io": 12, "en": 12, "queu": 12, "rocgdb": 12, "27": 12, "turn": 12, "startup": 12, "session": 12, "salloc": 12, "mz": 12, "restor": 12, "reload": 12, "hip_enable_deferred_load": 12, "amd_serialize_kernel": 12, "amd_serialize_copi": 12, "amd_log_level": 12, "lot": 12, "debugg": 12, "pagin": 12, "off": 12, "abort": 12, "trace": 12, "interrupt": 12, "bt": 12, "workaround": 12, "prevent": 12, "hang": 12, "fi_mr_cache_monitor": 12, "memhook": 12, "report": 12, "arena": 12, "big": 12, "the_arena_init_s": 12, "grow": 12, "suggest": 12, "larger": 12, "than": 12, "well": 13, "nvcc": 13, "cuda_vers": 13, "cc60": 13, "compile_cuda_path": 13, "usr": 13, "no_device_launch": 13, "around": 13, "cc70": 13, "On": 13, "lab": 13, "browser": 13, "ip": 13, "localhost": 13, "8888": 13, "sunysb": 13, "enter": 13, "password": 13, "window": 13, "web": 13, "prompt": 13, "token": 13, "appear": 13}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"work": [0, 1, 3, 8], "alcf": 0, "log": 0, "In": 0, "compil": [0, 1, 4, 10], "disk": 0, "queue": 0, "submit": [0, 12], "automat": 0, "restart": 0, "job": [0, 7, 12], "chain": [0, 7, 12], "iac": 1, "ookami": 1, "amrex": [1, 2], "setup": 1, "crai": 1, "gcc": [1, 4], "10": 1, "2": 1, "astrophys": 2, "suit": 2, "astro": 2, "basic": 2, "nersc": [3, 4, 6, 7], "perlmutt": [4, 7], "cuda": 4, "hypr": 4, "archiv": [5, 12], "data": 5, "hpss": [5, 12], "visual": [6, 9], "manag": [7, 12], "filesystem": 7, "olcf": [8, 10, 11, 12], "batch": 9, "Andes": 9, "summit": [10, 12], "frontier": [10, 12], "run": 11, "jupyt": [11, 13], "remot": [11, 13], "from": 11, "creat": 11, "conda": 11, "environ": 11, "architectur": 12, "request": 12, "alloc": 12, "writ": 12, "script": 12, "monitor": 12, "templat": 12, "machin": 12, "detail": 12, "statu": 12, "debug": 12, "troubleshoot": 
12, "linux": 13, "workstat": 13, "gpu": 13, "offload": 13, "bender": 13, "groot": 13, "vi": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 60}, "alltitles": {"Working at ALCF": [[0, "working-at-alcf"]], "Logging In": [[0, "logging-in"]], "Compiling": [[0, "compiling"]], "Disks": [[0, "disks"]], "Queues": [[0, "queues"]], "Submitting": [[0, "submitting"]], "Automatic Restarting": [[0, "automatic-restarting"]], "Job Chaining": [[0, "job-chaining"], [12, "job-chaining"]], "Working at IACS": [[1, "working-at-iacs"]], "Ookami": [[1, "ookami"]], "AMReX setup": [[1, "amrex-setup"]], "Cray compilers": [[1, "cray-compilers"]], "GCC": [[1, "gcc"]], "GCC 10.2": [[1, "gcc-10-2"]], "AMReX Astrophysics Suite": [[2, "amrex-astrophysics-suite"]], "AMReX Astro basics": [[2, null]], "Working at NERSC": [[3, "working-at-nersc"]], "Compiling at NERSC": [[4, "compiling-at-nersc"]], "Perlmutter": [[4, "perlmutter"], [7, "perlmutter"]], "Compiling with GCC + CUDA": [[4, "compiling-with-gcc-cuda"]], "Hypre": [[4, "hypre"]], "Archiving Data to HPSS": [[5, "archiving-data-to-hpss"]], "Visualization at NERSC": [[6, "visualization-at-nersc"]], "Managing Jobs at NERSC": [[7, "managing-jobs-at-nersc"]], "Filesystems": [[7, "filesystems"]], "Chaining": [[7, "chaining"]], "Working at OLCF": [[8, "working-at-olcf"]], "Batch Visualization on Andes": [[9, "batch-visualization-on-andes"]], "Compiling at OLCF": [[10, "compiling-at-olcf"]], "Summit": [[10, "summit"], [12, "summit"]], "Frontier": [[10, "frontier"], [12, "frontier"]], "Running Jupyter Remotely from OLCF": [[11, "running-jupyter-remotely-from-olcf"]], "Creating a conda environment": [[11, "creating-a-conda-environment"]], "Managing Jobs at OLCF": [[12, 
"managing-jobs-at-olcf"]], "Summit Architecture:": [[12, "summit-architecture"]], "Requesting Allocation:": [[12, "requesting-allocation"]], "Submitting a Job:": [[12, "submitting-a-job"]], "Writting a Job Script:": [[12, "writting-a-job-script"]], "Monitoring a Job:": [[12, "monitoring-a-job"]], "Script Template:": [[12, "script-template"]], "Chaining jobs": [[12, "chaining-jobs"]], "Archiving to HPSS": [[12, "archiving-to-hpss"]], "Machine details": [[12, "machine-details"]], "Submitting jobs": [[12, "submitting-jobs"]], "Job Status": [[12, "job-status"]], "Debugging": [[12, "debugging"]], "Troubleshooting": [[12, "troubleshooting"]], "Linux Workstations": [[13, "linux-workstations"]], "GPU offloading": [[13, "gpu-offloading"]], "bender": [[13, "bender"]], "groot": [[13, "groot"]], "Remote vis with Jupyter": [[13, "remote-vis-with-jupyter"]]}, "indexentries": {}}) \ No newline at end of file