Skip to content

Commit

Permalink
Test mpmd in rt
Browse files Browse the repository at this point in the history
  • Loading branch information
DusanJovic-NOAA committed Nov 26, 2024
1 parent c0367fd commit 184df06
Show file tree
Hide file tree
Showing 9 changed files with 240 additions and 22 deletions.
3 changes: 3 additions & 0 deletions tests/default_vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,9 @@ export esmf_logkind="ESMF_LOGKIND_MULTI"
export DumpFields="false"
export MED_history_n=1000000

# Use ESMF-managed threading by default (feeds globalResourceControl in the
# atparse'd ufs.configure templates); individual tests may override this to
# use traditional MPI+OpenMP threading.
export ESMF_THREADING=true
# MPMD launch mode (one launcher command segment per component, built in
# run_test.sh) is disabled unless a test explicitly enables it.
export MPMD=false

export_fv3_v16 ()
{
# Add support for v16 test cases. This section
Expand Down
3 changes: 2 additions & 1 deletion tests/fv3_conf/fv3_qsub.IN_acorn
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
#PBS -l walltime=00:@[WLCLK]:00

set -eux
echo -n " $( date +%s )," > job_timestamp.txt

cd $PBS_O_WORKDIR

echo -n " $( date +%s )," > job_timestamp.txt

set +x
module use $PWD/modulefiles
module load modules.fv3
Expand Down
7 changes: 4 additions & 3 deletions tests/fv3_conf/fv3_qsub.IN_wcoss2
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@
#PBS -N @[JBNME]
#PBS -A @[ACCNR]
#PBS -q @[QUEUE]
#PBS -l place=vscatter,select=@[NODES]:ncpus=@[TPN]:mpiprocs=@[TPN]:mem=500G
#PBS -l place=vscatter,select=@[NODES]:ncpus=128:mem=500G
#PBS -l place=excl
#PBS -l walltime=00:@[WLCLK]:00

set -eux
echo -n " $( date +%s )," > job_timestamp.txt

cd $PBS_O_WORKDIR

echo -n " $( date +%s )," > job_timestamp.txt

set +x
module use $PWD/modulefiles
module load modules.fv3
Expand All @@ -37,7 +38,7 @@ if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
false
fi

mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe
mpiexec @[MPIEXEC_CMD_ARGS]

echo "Model ended: " `date`
echo -n " $( date +%s )," >> job_timestamp.txt
3 changes: 1 addition & 2 deletions tests/fv3_conf/fv3_slurm.IN_gaea
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#SBATCH --clusters=c5
#SBATCH --partition=batch
#SBATCH --nodes=@[NODES]
#SBATCH --ntasks-per-node=@[TPN]
#SBATCH --time=@[WLCLK]

set -eux
Expand Down Expand Up @@ -40,7 +39,7 @@ if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
false
fi

srun --label -n @[TASKS] ./fv3.exe
srun --label @[SRUN_CMD_ARGS]

echo "Model ended: " `date`
echo -n " $( date +%s )," >> job_timestamp.txt
2 changes: 1 addition & 1 deletion tests/parm/ufs.configure.s2swa_fast_esmf.IN
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# ESMF #
logKindFlag: ESMF_LOGKIND_MULTI
globalResourceControl: true
globalResourceControl: @[ESMF_THREADING]

# EARTH #
EARTH_component_list: MED ATM CHM OCN ICE WAV
Expand Down
6 changes: 5 additions & 1 deletion tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1266,7 +1266,11 @@ EOF
(
source "${PATHRT}/tests/${TEST_NAME}"

compute_petbounds_and_tasks
if [[ ${ESMF_THREADING} == true ]]; then
compute_petbounds_and_tasks_esmf_threading
else
compute_petbounds_and_tasks_traditional_threading
fi

TPN=$(( TPN / THRD ))
NODES=$(( TASKS / TPN ))
Expand Down
126 changes: 123 additions & 3 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,127 @@ redirect_out_err() {
# redirect_out_err command will return non-zero if "$@" or tee return non-zero.
}

function compute_petbounds_and_tasks() {
compute_petbounds_and_tasks_traditional_threading() {

  # Compute ESMF PET-list bounds and per-component node counts for a run that
  # uses traditional MPI+OpenMP threading (ESMF-managed threading disabled).
  #
  # Contract: each test MUST define ${COMPONENT}_tasks for every component it
  # uses, and MUST leave it unset (or 0) for components it does not use.
  #
  # ATM is a special case: it runs on compute + io tasks, while the CHM
  # component and the mediator run on the ATM compute tasks only.
  #
  # Globals written: <cmp>_petlist_bounds, <cmp>_nodes (ATM/OCN/ICE/WAV),
  # mpmd_nodes, UFS_tasks, and TASKS (exported).

  if [[ "${DATM_CDEPS}" == false ]]; then
    if (( ${ATM_compute_tasks:-0} == 0 )); then
      ATM_compute_tasks=$(( INPES * JNPES * NTILES ))
    fi
    # Write-component (I/O) tasks exist only when quilting is enabled.
    if [[ "${QUILTING}" == .true. ]]; then
      ATM_io_tasks=$(( WRITE_GROUP * WRTTASK_PER_GROUP ))
    else
      ATM_io_tasks=0
    fi
    ATM_tasks=$(( ATM_compute_tasks + ATM_io_tasks ))
  fi

  # Next free PET index; bounds are inclusive [lo hi].
  local next_pet=0
  unset atm_petlist_bounds ocn_petlist_bounds ice_petlist_bounds wav_petlist_bounds chm_petlist_bounds med_petlist_bounds aqm_petlist_bounds fbh_petlist_bounds

  local threaded_tasks
  mpmd_nodes=0

  # ATM
  if (( ${ATM_tasks:-0} > 0 )); then
    atm_petlist_bounds="${next_pet} $(( next_pet + ATM_tasks - 1 ))"
    next_pet=$(( next_pet + ATM_tasks ))
    # Nodes needed when each MPI rank also carries its OpenMP threads
    # (ceiling division by tasks-per-node).
    threaded_tasks=$(( ATM_tasks * atm_omp_num_threads ))
    atm_nodes=$(( (threaded_tasks + TPN - 1) / TPN ))
    mpmd_nodes=$(( mpmd_nodes + atm_nodes ))
  fi

  # OCN
  if (( ${OCN_tasks:-0} > 0 )); then
    ocn_petlist_bounds="${next_pet} $(( next_pet + OCN_tasks - 1 ))"
    next_pet=$(( next_pet + OCN_tasks ))
    threaded_tasks=$(( OCN_tasks * ocn_omp_num_threads ))
    ocn_nodes=$(( (threaded_tasks + TPN - 1) / TPN ))
    mpmd_nodes=$(( mpmd_nodes + ocn_nodes ))
  fi

  # ICE
  if (( ${ICE_tasks:-0} > 0 )); then
    ice_petlist_bounds="${next_pet} $(( next_pet + ICE_tasks - 1 ))"
    next_pet=$(( next_pet + ICE_tasks ))
    threaded_tasks=$(( ICE_tasks * ice_omp_num_threads ))
    ice_nodes=$(( (threaded_tasks + TPN - 1) / TPN ))
    mpmd_nodes=$(( mpmd_nodes + ice_nodes ))
  fi

  # WAV
  if (( ${WAV_tasks:-0} > 0 )); then
    wav_petlist_bounds="${next_pet} $(( next_pet + WAV_tasks - 1 ))"
    next_pet=$(( next_pet + WAV_tasks ))
    threaded_tasks=$(( WAV_tasks * wav_omp_num_threads ))
    wav_nodes=$(( (threaded_tasks + TPN - 1) / TPN ))
    mpmd_nodes=$(( mpmd_nodes + wav_nodes ))
  fi

  # CHM, MED and AQM all run on the ATM compute PETs.
  chm_petlist_bounds="0 $(( ATM_compute_tasks - 1 ))"
  med_petlist_bounds="0 $(( ATM_compute_tasks - 1 ))"
  aqm_petlist_bounds="0 $(( ATM_compute_tasks - 1 ))"

  # LND
  if [[ "${lnd_model:-}" == lm4 ]]; then
    # LM4 land runs on the same PETs as ATM compute tasks.
    lnd_petlist_bounds="0 $(( ATM_compute_tasks - 1 ))"
  elif (( ${LND_tasks:-0} > 0 )); then
    # noahmp component or other
    lnd_petlist_bounds="${next_pet} $(( next_pet + LND_tasks - 1 ))"
    next_pet=$(( next_pet + LND_tasks ))
  fi

  # FBH
  if (( ${FBH_tasks:-0} > 0 )); then
    fbh_petlist_bounds="${next_pet} $(( next_pet + FBH_tasks - 1 ))"
    next_pet=$(( next_pet + FBH_tasks ))
  fi

  UFS_tasks=${next_pet}

  if [[ ${RTVERBOSE} == true ]]; then
    echo "ATM_petlist_bounds: ${atm_petlist_bounds:-}"
    echo "OCN_petlist_bounds: ${ocn_petlist_bounds:-}"
    echo "ICE_petlist_bounds: ${ice_petlist_bounds:-}"
    echo "WAV_petlist_bounds: ${wav_petlist_bounds:-}"
    echo "CHM_petlist_bounds: ${chm_petlist_bounds:-}"
    echo "MED_petlist_bounds: ${med_petlist_bounds:-}"
    echo "AQM_petlist_bounds: ${aqm_petlist_bounds:-}"
    echo "LND_petlist_bounds: ${lnd_petlist_bounds:-}"
    echo "FBH_petlist_bounds: ${fbh_petlist_bounds:-}"
    echo "UFS_tasks : ${UFS_tasks:-}"
    echo "mpmd_nodes : ${mpmd_nodes:-}"
  fi

  # TASKS is now set to UFS_TASKS
  export TASKS=${UFS_tasks}
}

function compute_petbounds_and_tasks_esmf_threading() {

# each test MUST define ${COMPONENT}_tasks variable for all components it is using
# and MUST NOT define those that it's not using or set the value to 0.
Expand Down Expand Up @@ -82,8 +202,8 @@ function compute_petbounds_and_tasks() {
if [[ ${lnd_model:-} = "lm4" ]]; then
# set lnd_petlist_bounds to be same as ATM_compute_tasks
lnd_petlist_bounds="0 $((ATM_compute_tasks - 1))"
elif [[ ${LND_tasks:-0} -gt 0 ]]; then # noahmp component or other
LND_tasks=$((LND_tasks * lnd_omp_num_threads))
elif [[ ${LND_tasks:-0} -gt 0 ]]; then # noahmp component or other
LND_tasks=$((LND_tasks * lnd_omp_num_threads))
lnd_petlist_bounds="${n} $((n + LND_tasks - 1))"
n=$((n + LND_tasks))
fi
Expand Down
105 changes: 94 additions & 11 deletions tests/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,19 @@ else
exit 1
fi

compute_petbounds_and_tasks
if [[ ${ESMF_THREADING} == true ]]; then
compute_petbounds_and_tasks_esmf_threading
else
compute_petbounds_and_tasks_traditional_threading
fi

if [[ -f ${PATHRT}/parm/${UFS_CONFIGURE} ]]; then
atparse < "${PATHRT}/parm/${UFS_CONFIGURE}" > ufs.configure
(
atparse < "${PATHRT}/parm/${UFS_CONFIGURE}" > ufs.configure
if [[ ${ESMF_THREADING} != true ]]; then
sed -i -e "/_omp_num_threads:/d" ufs.configure
fi
)
else
echo "Cannot find file ${UFS_CONFIGURE} set by variable UFS_CONFIGURE"
exit 1
Expand Down Expand Up @@ -240,7 +249,7 @@ fi
if [[ "Q${FIELD_TABLE:-}" != Q ]]; then
cp "${PATHRT}/parm/field_table/${FIELD_TABLE}" field_table
fi

# fix files
if [[ ${FV3} == true ]]; then
cp "${INPUTDATA_ROOT}"/FV3_fix/*.txt .
Expand Down Expand Up @@ -355,6 +364,7 @@ if [[ -n "${coupling_interval_slow_sec+x}" && -n "${coupling_interval_fast_sec+x
fi
fi

mpmd_tpn=${TPN}
TPN=$(( TPN / THRD ))
if (( TASKS < TPN )); then
TPN=${TASKS}
Expand All @@ -367,25 +377,98 @@ if (( NODES * TPN < TASKS )); then
fi
export NODES

UFS_TASKS=${TASKS}
TASKS=$(( NODES * TPN ))
export TASKS
if [[ ${ESMF_THREADING} == true ]]; then
UFS_TASKS=${TASKS}
TASKS=$(( NODES * TPN ))
export TASKS

PPN=$(( UFS_TASKS / NODES ))
if (( UFS_TASKS - ( PPN * NODES ) > 0 )); then
PPN=$((PPN + 1))
PPN=$(( UFS_TASKS / NODES ))
if (( UFS_TASKS - ( PPN * NODES ) > 0 )); then
PPN=$((PPN + 1))
fi
export PPN
export UFS_TASKS
else
PPN=${TPN}
fi
export PPN
export UFS_TASKS

if [[ ${SCHEDULER} = 'pbs' ]]; then

mpiexec_cmd=""
if [[ ${ESMF_THREADING} == false && ${MPMD} == true ]]; then
if [[ ${ATM_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${ATM_tasks} -ppn $((mpmd_tpn/atm_omp_num_threads)) --cpu-bind core --depth ${atm_omp_num_threads} --env OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :"
fi

if [[ ${OCN_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${OCN_tasks} -ppn $((mpmd_tpn/ocn_omp_num_threads)) --cpu-bind core --depth ${ocn_omp_num_threads} --env OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :"
fi

if [[ ${ICE_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${ICE_tasks} -ppn $((mpmd_tpn/ice_omp_num_threads)) --cpu-bind core --depth ${ice_omp_num_threads} --env OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :"
fi

if [[ ${WAV_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${WAV_tasks} -ppn $((mpmd_tpn/wav_omp_num_threads)) --cpu-bind core --depth ${wav_omp_num_threads} --env OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :"
fi

if [[ ${LND_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${LND_tasks} -ppn $((mpmd_tpn/lnd_omp_num_threads)) --cpu-bind core --depth ${lnd_omp_num_threads} --env OMP_NUM_THREADS=${lnd_omp_num_threads} ./fv3.exe :"
fi

if [[ ${FBH_tasks:-0} -gt 0 ]]; then
mpiexec_cmd+=" -n ${FBH_tasks} -ppn $((mpmd_tpn/fbh_omp_num_threads)) --cpu-bind core --depth ${fbh_omp_num_threads} --env OMP_NUM_THREADS=${fbh_omp_num_threads} ./fv3.exe :"
fi
# Remove trailing ':'
if [[ "${mpiexec_cmd: -1}" == ":" ]]; then
mpiexec_cmd="${mpiexec_cmd:0:-1}"
fi
NODES=${mpmd_nodes}
else
mpiexec_cmd+="-n ${TASKS} -ppn ${TPN} --cpu-bind core --depth ${THRD} ./fv3.exe"
fi

echo "mpiexec_cmd = ${mpiexec_cmd}"
MPIEXEC_CMD_ARGS=${mpiexec_cmd}

if [[ -e ${PATHRT}/fv3_conf/fv3_qsub.IN_${MACHINE_ID} ]]; then
atparse < "${PATHRT}/fv3_conf/fv3_qsub.IN_${MACHINE_ID}" > job_card
else
echo "Looking for fv3_conf/fv3_qsub.IN_${MACHINE_ID} but it is not found. Exiting"
exit 1
fi
elif [[ ${SCHEDULER} = 'slurm' ]]; then

srun_cmd=""
if [[ ${ESMF_THREADING} == false && ${MPMD} == true ]]; then
if [[ ${ATM_tasks:-0} -gt 0 ]]; then
srun_cmd+=" --nodes=${atm_nodes} --ntasks=${ATM_tasks} --ntasks-per-node=$((mpmd_tpn/atm_omp_num_threads)) --cpus-per-task=${atm_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :"
fi

if [[ ${OCN_tasks:-0} -gt 0 ]]; then
srun_cmd+=" --nodes=${ocn_nodes} --ntasks=${OCN_tasks} --ntasks-per-node=$((mpmd_tpn/ocn_omp_num_threads)) --cpus-per-task=${ocn_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :"
fi

if [[ ${ICE_tasks:-0} -gt 0 ]]; then
srun_cmd+=" --nodes=${ice_nodes} --ntasks=${ICE_tasks} --ntasks-per-node=$((mpmd_tpn/ice_omp_num_threads)) --cpus-per-task=${ice_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :"
fi

if [[ ${WAV_tasks:-0} -gt 0 ]]; then
srun_cmd+=" --nodes=${wav_nodes} --ntasks=${WAV_tasks} --ntasks-per-node=$((mpmd_tpn/wav_omp_num_threads)) --cpus-per-task=${wav_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :"
fi

# Remove trailing ':'
if [[ "${srun_cmd: -1}" == ":" ]]; then
srun_cmd="${srun_cmd:0:-1}"
fi
NODES=${mpmd_nodes}
else
srun_cmd+=" --ntasks=${TASKS} --ntasks-per-node=${TPN} --cpus-per-task=${THRD} ./fv3.exe"
fi

echo "srun_cmd = ${srun_cmd}"
SRUN_CMD_ARGS=${srun_cmd}

if [[ -e ${PATHRT}/fv3_conf/fv3_slurm.IN_${MACHINE_ID} ]]; then
atparse < "${PATHRT}/fv3_conf/fv3_slurm.IN_${MACHINE_ID}" > job_card
else
Expand Down
7 changes: 7 additions & 0 deletions tests/tests/cpld_2threads_p8
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,10 @@ export N_SPLIT=5
# HR4 GFSv17 GWD update
export DO_GWD_OPT_PSL=.true.
export DO_GSL_DRAG_SS=.false.

# Run this test with traditional MPI+OpenMP threading instead of
# ESMF-managed threading (globalResourceControl).
ESMF_THREADING=false
# NOTE(review): THRD_cpl_thrd is presumably defined by the shared test
# defaults sourced before this file — confirm it is set at this point.
THRD=$THRD_cpl_thrd

# On wcoss2 and gaea, also exercise the MPMD launch path, which builds one
# launcher command segment per component instead of a single fv3.exe launch.
if [[ $MACHINE_ID = wcoss2 ]] || [[ $MACHINE_ID = gaea ]]; then
  MPMD=true
fi

0 comments on commit 184df06

Please sign in to comment.