Skip to content

Commit

Permalink
Add mpmd on Gaea
Browse files — browse the repository at this point in the history
  • Loading branch information
DusanJovic-NOAA committed Nov 25, 2024
1 parent f970da0 commit a153b54
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 27 deletions.
3 changes: 1 addition & 2 deletions tests/fv3_conf/fv3_slurm.IN_gaea
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#SBATCH --clusters=c5
#SBATCH --partition=batch
#SBATCH --nodes=@[NODES]
#SBATCH --ntasks-per-node=@[TPN]
#SBATCH --time=@[WLCLK]

set -eux
Expand Down Expand Up @@ -40,7 +39,7 @@ if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
false
fi

srun --label -n @[TASKS] ./fv3.exe
srun --label @[SRUN_CMD_ARGS]

echo "Model ended: " `date`
echo -n " $( date +%s )," >> job_timestamp.txt
34 changes: 16 additions & 18 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,55 +45,54 @@ function compute_petbounds_and_tasks_traditional_threading() {
unset atm_petlist_bounds ocn_petlist_bounds ice_petlist_bounds wav_petlist_bounds chm_petlist_bounds med_petlist_bounds aqm_petlist_bounds fbh_petlist_bounds

local _tasks
local _nodes
mpmd_nodes=0

# ATM
if [[ ${ATM_tasks:-0} -gt 0 ]]; then
atm_petlist_bounds="${n} $((n + ATM_tasks - 1))"
n=$((n + ATM_tasks))
_tasks=$(( ATM_tasks*atm_omp_num_threads ))
_nodes=$(( _tasks / TPN ))
if (( _nodes * TPN < _tasks )); then
_nodes=$(( _nodes + 1 ))
atm_nodes=$(( _tasks / TPN ))
if (( atm_nodes * TPN < _tasks )); then
atm_nodes=$(( atm_nodes + 1 ))
fi
mpmd_nodes=$(( mpmd_nodes + _nodes ))
mpmd_nodes=$(( mpmd_nodes + atm_nodes ))
fi

# OCN
if [[ ${OCN_tasks:-0} -gt 0 ]]; then
ocn_petlist_bounds="${n} $((n + OCN_tasks - 1))"
n=$((n + OCN_tasks))
_tasks=$(( OCN_tasks*ocn_omp_num_threads ))
_nodes=$(( _tasks / TPN ))
if (( _nodes * TPN < _tasks )); then
_nodes=$(( _nodes + 1 ))
ocn_nodes=$(( _tasks / TPN ))
if (( ocn_nodes * TPN < _tasks )); then
ocn_nodes=$(( ocn_nodes + 1 ))
fi
mpmd_nodes=$(( mpmd_nodes + _nodes ))
mpmd_nodes=$(( mpmd_nodes + ocn_nodes ))
fi

# ICE
if [[ ${ICE_tasks:-0} -gt 0 ]]; then
ice_petlist_bounds="${n} $((n + ICE_tasks - 1))"
n=$((n + ICE_tasks))
_tasks=$(( ICE_tasks*ice_omp_num_threads ))
_nodes=$(( _tasks / TPN ))
if (( _nodes * TPN < _tasks )); then
_nodes=$(( _nodes + 1 ))
ice_nodes=$(( _tasks / TPN ))
if (( ice_nodes * TPN < _tasks )); then
ice_nodes=$(( ice_nodes + 1 ))
fi
mpmd_nodes=$(( mpmd_nodes + _nodes ))
mpmd_nodes=$(( mpmd_nodes + ice_nodes ))
fi

# WAV
if [[ ${WAV_tasks:-0} -gt 0 ]]; then
wav_petlist_bounds="${n} $((n + WAV_tasks - 1))"
n=$((n + WAV_tasks))
_tasks=$(( WAV_tasks*wav_omp_num_threads ))
_nodes=$(( _tasks / TPN ))
if (( _nodes * TPN < _tasks )); then
_nodes=$(( _nodes + 1 ))
wav_nodes=$(( _tasks / TPN ))
if (( wav_nodes * TPN < _tasks )); then
wav_nodes=$(( wav_nodes + 1 ))
fi
mpmd_nodes=$(( mpmd_nodes + _nodes ))
mpmd_nodes=$(( mpmd_nodes + wav_nodes ))
fi

# CHM
Expand Down Expand Up @@ -121,7 +120,6 @@ function compute_petbounds_and_tasks_traditional_threading() {
fi

unset _tasks
unset _nodes

UFS_tasks=${n}

Expand Down
44 changes: 38 additions & 6 deletions tests/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ if [[ -n "${coupling_interval_slow_sec+x}" && -n "${coupling_interval_fast_sec+x
fi
fi

mpmd_tpn=${TPN}
TPN=$(( TPN / THRD ))
if (( TASKS < TPN )); then
TPN=${TASKS}
Expand Down Expand Up @@ -405,27 +406,27 @@ if [[ ${SCHEDULER} = 'pbs' ]]; then
mpiexec_cmd=""
if [[ ${ESMF_THREADING} == false && ${MPMD} == true ]]; then
if [[ ${ATM_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${ATM_tasks} -ppn $((128/atm_omp_num_threads)) --cpu-bind verbose,depth --depth ${atm_omp_num_threads} --env OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :"
mpiexec_cmd+=" -n ${ATM_tasks} -ppn $((mpmd_tpn/atm_omp_num_threads)) --cpu-bind verbose,depth --depth ${atm_omp_num_threads} --env OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :"
fi

if [[ ${OCN_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${OCN_tasks} -ppn $((128/ocn_omp_num_threads)) --cpu-bind verbose,depth --depth ${ocn_omp_num_threads} --env OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :"
mpiexec_cmd+=" -n ${OCN_tasks} -ppn $((mpmd_tpn/ocn_omp_num_threads)) --cpu-bind verbose,depth --depth ${ocn_omp_num_threads} --env OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :"
fi

if [[ ${ICE_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${ICE_tasks} -ppn $((128/ice_omp_num_threads)) --cpu-bind verbose,depth --depth ${ice_omp_num_threads} --env OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :"
mpiexec_cmd+=" -n ${ICE_tasks} -ppn $((mpmd_tpn/ice_omp_num_threads)) --cpu-bind verbose,depth --depth ${ice_omp_num_threads} --env OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :"
fi

if [[ ${WAV_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${WAV_tasks} -ppn $((128/wav_omp_num_threads)) --cpu-bind verbose,depth --depth ${wav_omp_num_threads} --env OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :"
mpiexec_cmd+=" -n ${WAV_tasks} -ppn $((mpmd_tpn/wav_omp_num_threads)) --cpu-bind verbose,depth --depth ${wav_omp_num_threads} --env OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :"
fi

if [[ ${LND_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${LND_tasks} -ppn $((128/lnd_omp_num_threads)) --cpu-bind verbose,depth --depth ${lnd_omp_num_threads} --env OMP_NUM_THREADS=${lnd_omp_num_threads} ./fv3.exe :"
mpiexec_cmd+=" -n ${LND_tasks} -ppn $((mpmd_tpn/lnd_omp_num_threads)) --cpu-bind verbose,depth --depth ${lnd_omp_num_threads} --env OMP_NUM_THREADS=${lnd_omp_num_threads} ./fv3.exe :"
fi

if [[ ${FBH_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${FBH_tasks} -ppn $((128/fbh_omp_num_threads)) --cpu-bind verbose,depth --depth ${fbh_omp_num_threads} --env OMP_NUM_THREADS=${fbh_omp_num_threads} ./fv3.exe :"
mpiexec_cmd+=" -n ${FBH_tasks} -ppn $((mpmd_tpn/fbh_omp_num_threads)) --cpu-bind verbose,depth --depth ${fbh_omp_num_threads} --env OMP_NUM_THREADS=${fbh_omp_num_threads} ./fv3.exe :"
fi
# Remove trailing ':'
if [[ "${mpiexec_cmd: -1}" == ":" ]]; then
Expand All @@ -446,6 +447,37 @@ if [[ ${SCHEDULER} = 'pbs' ]]; then
exit 1
fi
elif [[ ${SCHEDULER} = 'slurm' ]]; then

# Assemble the srun argument string and publish it as SRUN_CMD_ARGS
# (presumably consumed by the @[SRUN_CMD_ARGS] placeholder of the Slurm job
# card template -- confirm against the template in use).
#
# Reads:  ESMF_THREADING, MPMD, TASKS, TPN, THRD, mpmd_tpn, mpmd_nodes and, per
#         component XXX in ATM/OCN/ICE/WAV: XXX_tasks, xxx_nodes,
#         xxx_omp_num_threads.
# Writes: srun_cmd, SRUN_CMD_ARGS and, in MPMD mode only, NODES.
if [[ ${ESMF_THREADING} != false || ${MPMD} != true ]]; then
  # Single-program launch: one segment describes the whole run.
  srun_cmd=" --ntasks=${TASKS} --ntasks-per-node=${TPN} --cpus-per-task=${THRD} ./fv3.exe"
else
  # MPMD launch: each active component gets its own segment, terminated by ':'.
  srun_cmd=""
  for _comp in ATM OCN ICE WAV; do
    _tasks_name="${_comp}_tasks"
    # A component with no tasks assigned contributes no segment.
    [[ ${!_tasks_name:-0} -gt 0 ]] || continue

    # Node count and thread count live in lower-case-prefixed variables.
    _lower=${_comp,,}
    _nodes_name="${_lower}_nodes"
    _threads_name="${_lower}_omp_num_threads"
    _threads=${!_threads_name}
    # Ranks per node = cores per node divided by threads per rank.
    _per_node=$(( mpmd_tpn / _threads ))

    srun_cmd+=" --nodes=${!_nodes_name} --ntasks=${!_tasks_name}"
    srun_cmd+=" --ntasks-per-node=${_per_node} --cpus-per-task=${_threads}"
    srun_cmd+=" --export=ALL,OMP_NUM_THREADS=${_threads} ./fv3.exe :"
  done
  unset _comp _tasks_name _lower _nodes_name _threads_name _threads _per_node

  # Drop the ':' left behind by the final segment (no-op if none was added).
  srun_cmd=${srun_cmd%:}
  # In MPMD mode the node request is the sum of the per-component node counts.
  NODES=${mpmd_nodes}
fi

echo "srun_cmd = ${srun_cmd}"
SRUN_CMD_ARGS=${srun_cmd}

if [[ -e ${PATHRT}/fv3_conf/fv3_slurm.IN_${MACHINE_ID} ]]; then
atparse < "${PATHRT}/fv3_conf/fv3_slurm.IN_${MACHINE_ID}" > job_card
else
Expand Down
5 changes: 4 additions & 1 deletion tests/tests/cpld_2threads_p8
Original file line number Diff line number Diff line change
Expand Up @@ -105,5 +105,8 @@ export DO_GWD_OPT_PSL=.true.
export DO_GSL_DRAG_SS=.false.

ESMF_THREADING=false
MPMD=true
THRD=$THRD_cpl_thrd

if [[ $MACHINE_ID = wcoss2 ]] || [[ $MACHINE_ID = gaea ]]; then
MPMD=true
fi

0 comments on commit a153b54

Please sign in to comment.