From a153b54be419ac6863f27ca9100973240341eba2 Mon Sep 17 00:00:00 2001 From: Dusan Jovic Date: Mon, 25 Nov 2024 17:43:40 -0500 Subject: [PATCH] Add mpmd on Gaea --- tests/fv3_conf/fv3_slurm.IN_gaea | 3 +-- tests/rt_utils.sh | 34 ++++++++++++------------ tests/run_test.sh | 44 +++++++++++++++++++++++++++----- tests/tests/cpld_2threads_p8 | 5 +++- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/tests/fv3_conf/fv3_slurm.IN_gaea b/tests/fv3_conf/fv3_slurm.IN_gaea index b306f1a1c5..5032c4bd7b 100644 --- a/tests/fv3_conf/fv3_slurm.IN_gaea +++ b/tests/fv3_conf/fv3_slurm.IN_gaea @@ -7,7 +7,6 @@ #SBATCH --clusters=c5 #SBATCH --partition=batch #SBATCH --nodes=@[NODES] -#SBATCH --ntasks-per-node=@[TPN] #SBATCH --time=@[WLCLK] set -eux @@ -40,7 +39,7 @@ if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then false fi -srun --label -n @[TASKS] ./fv3.exe +srun --label @[SRUN_CMD_ARGS] echo "Model ended: " `date` echo -n " $( date +%s )," >> job_timestamp.txt diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 10428c8417..66b493d592 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -45,7 +45,6 @@ function compute_petbounds_and_tasks_traditional_threading() { unset atm_petlist_bounds ocn_petlist_bounds ice_petlist_bounds wav_petlist_bounds chm_petlist_bounds med_petlist_bounds aqm_petlist_bounds fbh_petlist_bounds local _tasks - local _nodes mpmd_nodes=0 # ATM @@ -53,11 +52,11 @@ function compute_petbounds_and_tasks_traditional_threading() { atm_petlist_bounds="${n} $((n + ATM_tasks - 1))" n=$((n + ATM_tasks)) _tasks=$(( ATM_tasks*atm_omp_num_threads )) - _nodes=$(( _tasks / TPN )) - if (( _nodes * TPN < _tasks )); then - _nodes=$(( _nodes + 1 )) + atm_nodes=$(( _tasks / TPN )) + if (( atm_nodes * TPN < _tasks )); then + atm_nodes=$(( atm_nodes + 1 )) fi - mpmd_nodes=$(( mpmd_nodes + _nodes )) + mpmd_nodes=$(( mpmd_nodes + atm_nodes )) fi # OCN @@ -65,11 +64,11 @@ function compute_petbounds_and_tasks_traditional_threading() { ocn_petlist_bounds="${n} $((n + OCN_tasks - 1))" n=$((n + OCN_tasks)) _tasks=$(( OCN_tasks*ocn_omp_num_threads )) - _nodes=$(( _tasks / TPN )) - if (( _nodes * TPN < _tasks )); then - _nodes=$(( _nodes + 1 )) + ocn_nodes=$(( _tasks / TPN )) + if (( ocn_nodes * TPN < _tasks )); then + ocn_nodes=$(( ocn_nodes + 1 )) fi - mpmd_nodes=$(( mpmd_nodes + _nodes )) + mpmd_nodes=$(( mpmd_nodes + ocn_nodes )) fi # ICE @@ -77,11 +76,11 @@ function compute_petbounds_and_tasks_traditional_threading() { ice_petlist_bounds="${n} $((n + ICE_tasks - 1))" n=$((n + ICE_tasks)) _tasks=$(( ICE_tasks*ice_omp_num_threads )) - _nodes=$(( _tasks / TPN )) - if (( _nodes * TPN < _tasks )); then - _nodes=$(( _nodes + 1 )) + ice_nodes=$(( _tasks / TPN )) + if (( ice_nodes * TPN < _tasks )); then + ice_nodes=$(( ice_nodes + 1 )) fi - mpmd_nodes=$(( mpmd_nodes + _nodes )) + mpmd_nodes=$(( mpmd_nodes + ice_nodes )) fi # WAV @@ -89,11 +88,11 @@ function compute_petbounds_and_tasks_traditional_threading() { wav_petlist_bounds="${n} $((n + WAV_tasks - 1))" n=$((n + WAV_tasks)) _tasks=$(( WAV_tasks*wav_omp_num_threads )) - _nodes=$(( _tasks / TPN )) - if (( _nodes * TPN < _tasks )); then - _nodes=$(( _nodes + 1 )) + wav_nodes=$(( _tasks / TPN )) + if (( wav_nodes * TPN < _tasks )); then + wav_nodes=$(( wav_nodes + 1 )) fi - mpmd_nodes=$(( mpmd_nodes + _nodes )) + mpmd_nodes=$(( mpmd_nodes + wav_nodes )) fi # CHM @@ -121,7 +120,6 @@ function compute_petbounds_and_tasks_traditional_threading() { fi unset _tasks - unset _nodes UFS_tasks=${n} diff --git a/tests/run_test.sh b/tests/run_test.sh index 641d8cdbc2..57a9ccfbe5 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -373,6 +373,7 @@ if [[ -n "${coupling_interval_slow_sec+x}" && -n "${coupling_interval_fast_sec+x fi fi +mpmd_tpn=${TPN} TPN=$(( TPN / THRD )) if (( TASKS < TPN )); then TPN=${TASKS} @@ -405,27 +406,27 @@ if [[ ${SCHEDULER} = 'pbs' ]]; then mpiexec_cmd="" if [[ ${ESMF_THREADING} == false && ${MPMD} == true ]]; then if [[ ${ATM_tasks:-0} -gt 0 ]]; then - mpiexec_cmd+=" -n ${ATM_tasks} -ppn $((128/atm_omp_num_threads)) --cpu-bind verbose,depth --depth ${atm_omp_num_threads} --env OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :" + mpiexec_cmd+=" -n ${ATM_tasks} -ppn $((mpmd_tpn/atm_omp_num_threads)) --cpu-bind verbose,depth --depth ${atm_omp_num_threads} --env OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :" fi if [[ ${OCN_tasks:-0} -gt 0 ]]; then - mpiexec_cmd+=" -n ${OCN_tasks} -ppn $((128/ocn_omp_num_threads)) --cpu-bind verbose,depth --depth ${ocn_omp_num_threads} --env OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :" + mpiexec_cmd+=" -n ${OCN_tasks} -ppn $((mpmd_tpn/ocn_omp_num_threads)) --cpu-bind verbose,depth --depth ${ocn_omp_num_threads} --env OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :" fi if [[ ${ICE_tasks:-0} -gt 0 ]]; then - mpiexec_cmd+=" -n ${ICE_tasks} -ppn $((128/ice_omp_num_threads)) --cpu-bind verbose,depth --depth ${ice_omp_num_threads} --env OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :" + mpiexec_cmd+=" -n ${ICE_tasks} -ppn $((mpmd_tpn/ice_omp_num_threads)) --cpu-bind verbose,depth --depth ${ice_omp_num_threads} --env OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :" fi if [[ ${WAV_tasks:-0} -gt 0 ]]; then - mpiexec_cmd+=" -n ${WAV_tasks} -ppn $((128/wav_omp_num_threads)) --cpu-bind verbose,depth --depth ${wav_omp_num_threads} --env OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :" + mpiexec_cmd+=" -n ${WAV_tasks} -ppn $((mpmd_tpn/wav_omp_num_threads)) --cpu-bind verbose,depth --depth ${wav_omp_num_threads} --env OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :" fi if [[ ${LND_tasks:-0} -gt 0 ]]; then - mpiexec_cmd+=" -n ${LND_tasks} -ppn $((128/lnd_omp_num_threads)) --cpu-bind verbose,depth --depth ${lnd_omp_num_threads} --env OMP_NUM_THREADS=${lnd_omp_num_threads} ./fv3.exe :" + mpiexec_cmd+=" -n ${LND_tasks} -ppn $((mpmd_tpn/lnd_omp_num_threads)) --cpu-bind verbose,depth --depth ${lnd_omp_num_threads} --env OMP_NUM_THREADS=${lnd_omp_num_threads} ./fv3.exe :" fi if [[ ${FBH_tasks:-0} -gt 0 ]]; then - mpiexec_cmd+=" -n ${FBH_tasks} -ppn $((128/fbh_omp_num_threads)) --cpu-bind verbose,depth --depth ${fbh_omp_num_threads} --env OMP_NUM_THREADS=${fbh_omp_num_threads} ./fv3.exe :" + mpiexec_cmd+=" -n ${FBH_tasks} -ppn $((mpmd_tpn/fbh_omp_num_threads)) --cpu-bind verbose,depth --depth ${fbh_omp_num_threads} --env OMP_NUM_THREADS=${fbh_omp_num_threads} ./fv3.exe :" fi # Remove trailing ':' if [[ "${mpiexec_cmd: -1}" == ":" ]]; then @@ -446,6 +447,37 @@ if [[ ${SCHEDULER} = 'pbs' ]]; then exit 1 fi elif [[ ${SCHEDULER} = 'slurm' ]]; then + + srun_cmd="" + if [[ ${ESMF_THREADING} == false && ${MPMD} == true ]]; then + if [[ ${ATM_tasks:-0} -gt 0 ]]; then + srun_cmd+=" --nodes=${atm_nodes} --ntasks=${ATM_tasks} --ntasks-per-node=$((mpmd_tpn/atm_omp_num_threads)) --cpus-per-task=${atm_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :" + fi + + if [[ ${OCN_tasks:-0} -gt 0 ]]; then + srun_cmd+=" --nodes=${ocn_nodes} --ntasks=${OCN_tasks} --ntasks-per-node=$((mpmd_tpn/ocn_omp_num_threads)) --cpus-per-task=${ocn_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :" + fi + + if [[ ${ICE_tasks:-0} -gt 0 ]]; then + srun_cmd+=" --nodes=${ice_nodes} --ntasks=${ICE_tasks} --ntasks-per-node=$((mpmd_tpn/ice_omp_num_threads)) --cpus-per-task=${ice_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :" + fi + + if [[ ${WAV_tasks:-0} -gt 0 ]]; then + srun_cmd+=" --nodes=${wav_nodes} --ntasks=${WAV_tasks} --ntasks-per-node=$((mpmd_tpn/wav_omp_num_threads)) --cpus-per-task=${wav_omp_num_threads} --export=ALL,OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :" + fi + + # Remove trailing ':' + if [[ "${srun_cmd: -1}" == ":" ]]; then + srun_cmd="${srun_cmd:0:-1}" + fi + NODES=${mpmd_nodes} + else + srun_cmd+=" --ntasks=${TASKS} --ntasks-per-node=${TPN} --cpus-per-task=${THRD} ./fv3.exe" + fi + + echo "srun_cmd = ${srun_cmd}" + SRUN_CMD_ARGS=${srun_cmd} + if [[ -e ${PATHRT}/fv3_conf/fv3_slurm.IN_${MACHINE_ID} ]]; then atparse < "${PATHRT}/fv3_conf/fv3_slurm.IN_${MACHINE_ID}" > job_card else diff --git a/tests/tests/cpld_2threads_p8 b/tests/tests/cpld_2threads_p8 index 94fff4f50c..5337134321 100644 --- a/tests/tests/cpld_2threads_p8 +++ b/tests/tests/cpld_2threads_p8 @@ -105,5 +105,8 @@ export DO_GWD_OPT_PSL=.true. export DO_GSL_DRAG_SS=.false. ESMF_THREADING=false -MPMD=true THRD=$THRD_cpl_thrd + +if [[ $MACHINE_ID = wcoss2 ]] || [[ $MACHINE_ID = gaea ]]; then + MPMD=true +fi