Merge branch 'main' of github.com:amrex-astro/workflow
zingale committed Jan 30, 2024
2 parents 01523af + 643b291 commit 707b6d4
Showing 29 changed files with 1,425 additions and 679 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/gh-pages.yml
@@ -4,18 +4,18 @@ on: push

jobs:
deploy:
runs-on: ubuntu-18.04
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- name: Install pandoc and doxygen
run: |
sudo apt install pandoc doxygen
- name: Setup Python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: '3.8'
python-version: '3.10'

- name: Upgrade pip
run: |
@@ -27,7 +27,7 @@ jobs:
run: echo "::set-output name=dir::$(pip cache dir)"

- name: Cache dependencies
uses: actions/cache@v1
uses: actions/cache@v3
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
@@ -45,4 +45,4 @@ jobs:
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./out
keep_files: true
keep_files: true
61 changes: 61 additions & 0 deletions job_scripts/cori-haswell/cori_haswell.MPI.slurm
@@ -0,0 +1,61 @@
#!/bin/bash
#SBATCH --job-name=f3anw-strang
#SBATCH --account=m3018
#SBATCH --nodes=20
#SBATCH --tasks-per-node=32
#SBATCH --qos=regular
#SBATCH --time=02:00:00
#SBATCH --constraint=haswell

# Each Cori Haswell node has 2 Intel "Haswell" processors, each with 16 cores.
# number of nodes = (# of MPI tasks * number of threads) / 32 (cores per node)
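# e.g. for the settings above: 20 nodes * 32 tasks/node = 640 MPI tasks, and
# 640 tasks * 1 thread / 32 cores per node = 20 nodes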

export OMP_NUM_THREADS=1
export MPICH_MAX_THREAD_SAFETY=multiple
export CASTRO_EXEC=./Castro2d.intel.haswell.MPI.ex
export INPUTS=inputs

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- if it's there, update the restart file
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" = "" ]; then
restartString=""
else
restartString="amr.restart=${restartFile}"
fi
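# e.g. if ./chk0012345 is the newest checkpoint with a Header file, this gives
# restartString="amr.restart=./chk0012345" (the checkpoint name here is
# illustrative only)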

workdir=`basename ${SLURM_SUBMIT_DIR}`
slack_job_start.py "starting NERSC job: ${workdir} ${restartFile}" @michael

srun ${CASTRO_EXEC} ${INPUTS} ${restartString}

67 changes: 67 additions & 0 deletions job_scripts/frontier/frontier.slurm
@@ -0,0 +1,67 @@
#!/bin/bash
#SBATCH -A AST106
#SBATCH -J subch
#SBATCH -o %x-%j.out
#SBATCH -t 02:00:00
#SBATCH -p batch
# here N is the number of compute nodes
#SBATCH -N 4
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=closest

EXEC=Castro2d.hip.x86-trento.MPI.HIP.SMPLSDC.ex
INPUTS=inputs_2d.N14.coarse

module load PrgEnv-gnu craype-accel-amd-gfx90a cray-mpich rocm/5.3.0
module load amd-mixed/5.3.0

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- if it's there, update the restart file
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" = "" ]; then
restartString=""
else
restartString="amr.restart=${restartFile}"
fi

export OMP_NUM_THREADS=1
export NMPI_PER_NODE=8
export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${NMPI_PER_NODE} ))
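# with the SBATCH settings above: TOTAL_NMPI = 4 nodes * 8 tasks/node = 32 MPI
# tasks, each bound to a single GPU as requested by --gpus-per-task=1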

srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString}

95 changes: 95 additions & 0 deletions job_scripts/perlmutter-cpu/perlmutter_cpu.slurm
@@ -0,0 +1,95 @@
#!/bin/bash
#SBATCH --job-name=perlmutter_script
#SBATCH --account=m3018
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=16
#SBATCH --qos=regular
#SBATCH --time=02:00:00
#SBATCH --constraint=cpu

#***********************INSTRUCTIONS*******************************************************
# In order to couple each MPI process with OpenMP threads we use the following
# strategy:
#
# 1. First we fix one node in the --qos debug queue and choose a 2**n value for
# --ntasks-per-node, starting with n=0.
#
# 2. Next, we compute --cpus-per-task = 256 / --ntasks-per-node. This is the
# number of virtual cores available to each MPI process on each node (see the
# worked example below). Each physical core provides two virtual cores; the
# node has two sockets of 64 physical cores each (64+64), and each NUMA domain
# contains 64 physical cores.
#
# 3. From the number of available virtual cores we obtain the number of physical
# cores and bind each OpenMP thread to one physical core via
# export OMP_NUM_THREADS. A lower number may be chosen (in case of memory
# shortage), but in principle we want to use all of the available resources
# first.
#
# 4. Run the script and check the wall-clock time per timestep. On Perlmutter I use
# grep Coarse <slurm_output>
#
# 5. Repeat steps 1-4 until the best MPI/OpenMP balance is reached for the
# choice of n.
#
# 6. Compare different amr.max_grid_size values until the optimal one is found.
# It is usually near half of the level 0 grid size. Furthermore, test several
# amr.blocking_factor sizes.
#
# 7. Finally, increase the number of nodes from 1 to 2, 4, 8, ... and compare the
# wall-clock times. If the problem scales well, the wall-clock time will go down
# by a factor of ~2 each time the number of nodes doubles. The scaling will break
# down beyond some node count; that is the number of nodes to select.
#
# 8. Run a chain of jobs using this script and ./chainslurm.sh
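#
# Worked example for the SBATCH settings above: --ntasks-per-node=16 gives
# --cpus-per-task = 256 / 16 = 16 virtual cores = 8 physical cores per MPI task,
# which is why OMP_NUM_THREADS=8 is exported below, and the job runs
# 16 nodes * 16 tasks/node = 256 MPI ranks in total.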

export OMP_NUM_THREADS=8
export OMP_PLACES=cores
export OMP_PROC_BIND=spread

#export MPICH_MAX_THREAD_SAFETY=multiple
export CASTRO_EXEC=./Castro2d.gnu.x86-milan.MPI.OMP.ex
export INPUTS_FILE=./inputs_nova_t7

function find_chk_file {
# find_chk_file takes a single argument -- the wildcard pattern
# for checkpoint files to look through
chk=$1

# find the latest 2 restart files. This way if the latest didn't
# complete we fall back to the previous one.
temp_files=$(find . -maxdepth 1 -name "${chk}" -print | sort | tail -2)
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- if it's there, update the restart file
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done

}

# look for 7-digit chk files
find_chk_file "*chk???????"

if [ "${restartFile}" = "" ]; then
# look for 6-digit chk files
find_chk_file "*chk??????"
fi

if [ "${restartFile}" = "" ]; then
# look for 5-digit chk files
find_chk_file "*chk?????"
fi

# restartString will be empty if no chk files are found -- i.e. new run
if [ "${restartFile}" != "" ]; then
restartString="amr.restart=${restartFile}"
else
restartString=""
fi

srun -n $((SLURM_NTASKS_PER_NODE * SLURM_NNODES)) -c ${SLURM_CPUS_PER_TASK} --cpu-bind=cores ${CASTRO_EXEC} ${INPUTS_FILE} ${restartString}
36 changes: 30 additions & 6 deletions job_scripts/perlmutter/perlmutter.submit
@@ -10,6 +10,7 @@
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3
#SBATCH --signal=B:URG@2

export CASTRO_EXEC=./Castro2d.gnu.MPI.CUDA.SMPLSDC.ex
export INPUTS=inputs_2d.N14
@@ -28,11 +29,8 @@ function find_chk_file {
restartFile=""
for f in ${temp_files}
do
# the Header is the last thing written -- check if it's there, otherwise,
# fall back to the second-to-last check file written
if [ ! -f ${f}/Header ]; then
restartFile=""
else
# the Header is the last thing written -- if it's there, update the restart file
if [ -f ${f}/Header ]; then
restartFile="${f}"
fi
done
@@ -59,8 +57,34 @@ else
restartString="amr.restart=${restartFile}"
fi

# clean up any run management files left over from previous runs
rm -f dump_and_stop

# The `--signal=B:URG@<n>` option tells slurm to send SIGURG to this batch
# script n seconds before the runtime limit, so we can exit gracefully.
function sig_handler {
touch dump_and_stop
# disable this signal handler
trap - URG
echo "BATCH: allocation ending soon; telling Castro to dump a checkpoint and stop"
}
trap sig_handler URG

workdir=`basename ${SLURM_SUBMIT_DIR}`
slack_job_start.py "starting NERSC job: ${workdir} ${restartFile}" @michael


srun -n 64 ${CASTRO_EXEC} ${INPUTS} ${restartString}
# execute srun in the background then use the builtin wait so the shell can
# handle the signal
srun -n 64 ${CASTRO_EXEC} ${INPUTS} ${restartString} &
pid=$!
wait $pid
ret=$?

# bash's wait returns 128 + the signal number when it is interrupted by a
# trapped signal; SIGURG is signal 23 on Linux x86-64, so 151 means the
# time-limit warning arrived while srun was still running
if (( ret == 128 + 23 )); then
# received SIGURG, keep waiting for srun to finish
wait $pid
ret=$?
fi

exit $ret
56 changes: 56 additions & 0 deletions job_scripts/polaris/chainqsub.sh
@@ -0,0 +1,56 @@
#!/bin/sh -f

if [ ! "$1" ] || [ ! "$2" ] || [ ! "$3" ]; then
echo "usage: chainqsub.sh jobid number script"
echo " set jobid -1 for no initial dependency"
exit 1
fi


oldjob=$1
numjobs=$2
script=$3

if [ $numjobs -gt "20" ]; then
echo "too many jobs requested"
exit 1
fi

firstcount=1

if [ $oldjob -eq "-1" ]
then
echo chaining $numjobs jobs

echo starting job 1 with no dependency
aout=$(qsub ${script})
echo " " jobid: $aout
echo " "
oldjob=$aout
firstcount=2
sleep 3
else
echo chaining $numjobs jobs starting with $oldjob
fi

for count in $(seq $firstcount 1 $numjobs)
do
echo starting job $count to depend on $oldjob
aout=$(qsub -W depend=afterany:${oldjob} ${script})
echo " " jobid: $aout
echo " "
oldjob=$aout
sleep 2
done
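
# Example usage (the submit script name is illustrative):
#   start a chain of 5 jobs with no initial dependency:
#     ./chainqsub.sh -1 5 polaris.submit
#   append 3 more jobs after an existing job id 123456:
#     ./chainqsub.sh 123456 3 polaris.submit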
The remaining 23 changed files are not shown.