From a6f734990fa1c151c8f23e8fb2d84f8e3109ccd8 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 26 Feb 2024 09:56:29 +0100 Subject: [PATCH 01/33] add timers in ogstm --- GeneralCmake.cmake | 3 +- application/ogstm_main_caller.f90 | 2 ++ compilers/machine_modules/leonardo.nvhpc | 8 +++++ src/BIO/trcbio.f90 | 16 ++++++++- src/BIO/trcsms.f90 | 3 ++ src/General/step.f90 | 44 ++++++++++++++++++++++++ 6 files changed, 74 insertions(+), 2 deletions(-) diff --git a/GeneralCmake.cmake b/GeneralCmake.cmake index a61d5aad..76296a45 100644 --- a/GeneralCmake.cmake +++ b/GeneralCmake.cmake @@ -56,6 +56,7 @@ link_directories(${MPI_Fortran_LIBRARIES}) include_directories(${BFM_INCLUDES}) include_directories(${NETCDF_INCLUDES_C}) include_directories(${NETCDFF_INCLUDES_F90}) +include_directories($ENV{SIMPLE_TIMER_INCLUDE_DIR}) # Search Fortran module to compile set( FOLDERS BIO General IO MPI namelists PHYS BC) @@ -68,4 +69,4 @@ endforeach() #building add_library( ogstm_lib ${FORTRAN_SOURCES}) add_executable (ogstm.xx application/ogstm_main_caller.f90) -target_link_libraries(ogstm.xx ogstm_lib ${NETCDFF_LIBRARIES_F90} ${BFM_LIBRARIES} MPI::MPI_Fortran) +target_link_libraries(ogstm.xx ogstm_lib ${NETCDFF_LIBRARIES_F90} ${BFM_LIBRARIES} MPI::MPI_Fortran $ENV{SIMPLE_TIMER_LIBS}) diff --git a/application/ogstm_main_caller.f90 b/application/ogstm_main_caller.f90 index 667864d5..06421cf9 100644 --- a/application/ogstm_main_caller.f90 +++ b/application/ogstm_main_caller.f90 @@ -9,6 +9,7 @@ PROGRAM OGSTM_MAIN #ifdef _OPENACC use openacc #endif + use simple_timer implicit none integer :: info, ierr @@ -34,6 +35,7 @@ PROGRAM OGSTM_MAIN !$OMP END MASTER !$OMP END PARALLEL + call tprint() CALL mpi_finalize(info) END PROGRAM OGSTM_MAIN diff --git a/compilers/machine_modules/leonardo.nvhpc b/compilers/machine_modules/leonardo.nvhpc index f4c8394a..644e0093 100644 --- a/compilers/machine_modules/leonardo.nvhpc +++ b/compilers/machine_modules/leonardo.nvhpc @@ -15,3 +15,11 @@ export NETCDF_FFLAGS=$(nf-config --fflags) export NETCDF_FLIBS=$(nf-config --flibs) export NETCDFF_LIB=$(nf-config --prefix)/lib export NETCDFF_INC=$(nf-config --includedir) + +SIMPLE_TIMER_ROOT=/leonardo_work/OGS23_PRACE_IT_0/llucido0/simple-timer/nvhpc--23.11_cuda--12.3_openmpi--4.1.6 +export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include" +export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}" +export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib" +export SIMPLE_TIMER_LIBS="-L ${SIMPLE_TIMER_LIB_DIR} -lsimple_timer -lsimple_timer_f" + +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${SIMPLE_TIMER_LIB_DIR}" diff --git a/src/BIO/trcbio.f90 b/src/BIO/trcbio.f90 index c6e52bcc..f84fe51e 100644 --- a/src/BIO/trcbio.f90 +++ b/src/BIO/trcbio.f90 @@ -45,6 +45,7 @@ SUBROUTINE trcbio !---------------------------------------------------------------------- ! END BC_REFACTORING SECTION ! --------------------------------------------------------------------- + use simple_timer IMPLICIT NONE @@ -59,6 +60,8 @@ SUBROUTINE trcbio integer :: year, month, day double precision :: sec + + call tstart("trcbio_1") BIOparttime = MPI_WTIME() ! Initialization @@ -130,11 +133,21 @@ SUBROUTINE trcbio end if end do end do - + + call tstop("trcbio_1") + call tstart("BFM1D_In_EcologyDynamics") call BFM1D_Input_EcologyDynamics(mbathy, er) ! here mbathy was bottom + call tstop("BFM1D_In_EcologyDynamics") + call tstart("BFM1D_reset") call BFM1D_reset() + call tstop("BFM1D_reset") + call tstart("EcologyDynamics") call EcologyDynamics() + call tstop("EcologyDynamics") + call tstart("BFM1D_Out_EcologyDynamics") call BFM1D_Output_EcologyDynamics(sediPPY, local_D3DIAGNOS, local_D2DIAGNOS) + call tstop("BFM1D_Out_EcologyDynamics") + call tstart("trcbio_2") ! The following copies could be avoided do jn = 1, max(4, jptra, jptra_dia) @@ -186,4 +199,5 @@ SUBROUTINE trcbio !--------------------------------------------------------------------- BIOparttime = MPI_WTIME() - BIOparttime BIOtottime = BIOtottime + BIOparttime + call tstop("trcbio_2") END SUBROUTINE trcbio diff --git a/src/BIO/trcsms.f90 b/src/BIO/trcsms.f90 index 7c463d54..fc2ab56d 100644 --- a/src/BIO/trcsms.f90 +++ b/src/BIO/trcsms.f90 @@ -22,6 +22,7 @@ SUBROUTINE trcsms USE myalloc USE mpi + use simple_timer IMPLICIT NONE @@ -34,7 +35,9 @@ SUBROUTINE trcsms CALL trcopt ! tracers: optical model + call tstart("trcbio") CALL trcbio ! tracers: biological model + call tstop("trcbio") !! trcsed no updated for time step advancing #if defined key_trc_sed diff --git a/src/General/step.f90 b/src/General/step.f90 index 80c8ef31..819668ad 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -57,6 +57,7 @@ SUBROUTINE step ! trcstp, trcdia passive tracers interface + use simple_timer IMPLICIT NONE @@ -81,8 +82,11 @@ SUBROUTINE step if (IsStartBackup_2) datefrom_2 = BKPdatefrom_2 datestring = DATESTART TAU = 0 + call tstart("step_total") DO WHILE (.not.ISOVERTIME(datestring)) + + call tstart("step_1") stpparttime = MPI_WTIME() ! stop cronomether COMMON_DATESTRING = DATEstring @@ -92,7 +96,9 @@ SUBROUTINE step if(lwp) write(numout,'(A,I8,A,A)') "step ------------ Starting timestep = ",TAU,' time ',DATEstring if(lwp) write(*,'(A,I8,A,A)') "step ------------ Starting timestep = ",TAU,' time ',DATEstring + call tstop("step_1") + call tstart("restart") if (IsaRestart(DATEstring)) then CALL trcwri(DATEstring) ! writes the restart files @@ -114,31 +120,39 @@ SUBROUTINE step B = writeTemporization("trcwri____", trcwritottime) endif endif + call tstop("restart") ! For offline simulation READ DATA or precalculalted dynamics fields ! ------------------------------------------------------------------ + call tstart("forcing") CALL forcings_PHYS(DATEstring) CALL forcings_KEXT(datestring) + call tstop("forcing") ! ---------------------------------------------------------------------- ! BEGIN BC_REFACTORING SECTION ! --------------------------------------------------------------------- + call tstart("boundaries%update") call boundaries%update(datestring) + call tstop("boundaries%update") ! ---------------------------------------------------------------------- ! END BC_REFACTORING SECTION ! --------------------------------------------------------------------- + call tstart("bc+eos") CALL bc_atm (DATEstring) ! CALL dtatrc(istp,2) CALL bc_co2 (DATEstring) CALL eos () ! Water density + call tstop("bc+eos") + call tstart("dump_ave_1") if (IsAnAveDump(DATEstring,1)) then call MIDDLEDATE(datefrom_1, DATEstring, datemean) CALL trcdia(datemean, datefrom_1, datestring,1) @@ -149,7 +163,9 @@ SUBROUTINE step if (lwp) B = writeTemporization("trcdia____", trcdiatottime) endif + call tstop("dump_ave_1") + call tstart("dump_ave_2") if (IsAnAveDump(DATEstring,2)) then call MIDDLEDATE(datefrom_2, DATEstring, datemean) CALL trcdia(datemean, datefrom_2, datestring,2) @@ -158,20 +174,27 @@ SUBROUTINE step IsStartBackup_2 = .false. if (lwp) B = writeTemporization("trcdia____", trcdiatottime) endif + call tstop("dump_ave_2") #ifdef ExecDA + call tstart("data_assim") if (IsaDataAssimilation(DATEstring)) then CALL mainAssimilation(DATEstring, datefrom_1) if (lwp) B = writeTemporization("DATA_ASSIMILATION____", DAparttime) endif + call tstop("data_assim") #endif ! Call Passive tracer model between synchronization for small parallelisation + call tstart("trcstp_all") CALL trcstp ! se commento questo non fa calcoli + call tstop("trcstp_all") + call tstart("trcave") call trcave + call tstop("trcave") elapsed_time_1 = elapsed_time_1 + rdt elapsed_time_2 = elapsed_time_2 + rdt @@ -181,6 +204,7 @@ SUBROUTINE step ! OGSTM TEMPORIZATION + call tstart("temporization") IF (TAU.GT.0) THEN IF( mod( TAU, nwritetrc ).EQ.0) THEN if (lwp) then @@ -216,6 +240,7 @@ SUBROUTINE step call reset_Timers() ENDIF ENDIF + call tstop("temporization") !+++++++++++++++++++++++++++++c @@ -265,20 +290,27 @@ SUBROUTINE trcstp ! with surface boundary condition ! with IMPLICIT vertical diffusion + use simple_timer IMPLICIT NONE integer jn,jk,ji,jj trcstpparttime = MPI_WTIME() ! cronometer-start + call tstart("trcadv") IF (ladv) CALL trcadv ! tracers: advection + call tstop("trcadv") # if defined key_trc_dmp + call tstart("trcdmp") CALL trcdmp ! tracers: damping for passive tracerstrcstp + call tstop("trcdmp") ! ---------------------------------------------------------------------- ! BEGIN BC_REFACTORING SECTION ! --------------------------------------------------------------------- + call tstart("boundaries%apply") call boundaries%apply(e3t, trb, tra) + call tstop("boundaries%apply") ! ---------------------------------------------------------------------- ! END BC_REFACTORING SECTION @@ -289,22 +321,34 @@ SUBROUTINE trcstp ! tracers: horizontal diffusion IF namelist flags are activated ! ----------------------------- + call tstart("trchdf") IF (lhdf) CALL trchdf + call tstop("trchdf") ! tracers: sink and source (must be parallelized on vertical slab) + call tstart("trcsbc") IF (lsbc) CALL trcsbc ! surface cell processes, default lsbc = False + call tstop("trcsbc") + call tstart("trcsms") IF (lbfm) CALL trcsms + call tstop("trcsms") + call tstart("trczdf") IF (lzdf) CALL trczdf ! tracers: vertical diffusion + call tstop("trczdf") + call tstart("snutel") IF (lsnu) CALL snutel + call tstop("snutel") call boundaries%apply_dirichlet() ! CALL checkValues + call tstart("trcadv") CALL trcnxt ! tracers: fields at next time step + call tstop("trcadv") trcstpparttime = MPI_WTIME() - trcstpparttime ! cronometer-stop trcstptottime = trcstptottime + trcstpparttime From 5fdc7feaa3fe939a33ce490f7b3fc3f32c0ad798 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 27 Feb 2024 10:04:20 +0100 Subject: [PATCH 02/33] add -Minfo=all for release flag --- GeneralCmake.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GeneralCmake.cmake b/GeneralCmake.cmake index 76296a45..0fdacb8c 100644 --- a/GeneralCmake.cmake +++ b/GeneralCmake.cmake @@ -40,7 +40,7 @@ elseif(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") set(CMAKE_Fortran_FLAGS_DEBUG " -Og -ggdb3 -fimplicit-none -cpp -ffree-line-length-none -Wall -Wextra -fno-omit-frame-pointer -fbounds-check -pedantic -ffpe-trap=invalid,zero,overflow") set(CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer") elseif(CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC|PGI") - set(CMAKE_Fortran_FLAGS_RELEASE " -Kieee -g -traceback -fast -acc -gpu=pinned -Mextend -Mpreprocess") + set(CMAKE_Fortran_FLAGS_RELEASE " -Kieee -g -traceback -fast -acc -gpu=pinned -Mextend -Mpreprocess -Minfo=accel") set(CMAKE_Fortran_FLAGS_DEBUG " -Kieee -g -traceback -O0 -acc -gpu=pinned,debug -Mbounds -Mextend -Mpreprocess -Minfo=accel") else() message ("CMAKE_Fortran_COMPILER full path: " ${CMAKE_Fortran_COMPILER}) From 9d1f287b0fb42db7924c447418106b9a61d75c59 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 22 Apr 2024 15:31:45 +0200 Subject: [PATCH 03/33] add Minfo=accel and fix one timer --- src/General/step.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/General/step.f90 b/src/General/step.f90 index 819668ad..7fa34014 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -346,9 +346,9 @@ SUBROUTINE trcstp ! CALL checkValues - call tstart("trcadv") + call tstart("trcnxt") CALL trcnxt ! tracers: fields at next time step - call tstop("trcadv") + call tstop("trcnxt") trcstpparttime = MPI_WTIME() - trcstpparttime ! cronometer-stop trcstptottime = trcstptottime + trcstpparttime From a42298df290075f92c0e708dd627f85f96777197 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 15 Apr 2024 23:08:33 +0200 Subject: [PATCH 04/33] port trczdf to GPU --- src/General/memory.f90 | 9 +++++++-- src/General/step.f90 | 8 ++++++++ src/IO/DIA_mem.f90 | 2 ++ src/PHYS/ZDF_mem.f90 | 29 ++++++++++++++++++++++++++- src/PHYS/trcadv.f90 | 8 +------- src/PHYS/trczdf.f90 | 45 +++++++++++++++++++++++++++--------------- 6 files changed, 75 insertions(+), 26 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 2af71f75..975c1480 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -516,7 +516,7 @@ subroutine alloc_tot() gphiv = huge(gphiv(1,1)) allocate(gphif(jpj,jpi)) gphif = huge(gphif(1,1)) - allocate(e1t(jpj,jpi)) + allocate(e1t(jpj,jpi)) e1t = huge(e1t(1,1)) allocate(e1u(jpj,jpi)) e1u = huge(e1u(1,1)) @@ -790,10 +790,13 @@ subroutine alloc_tot() allocate(DAY_LENGTH(jpj,jpi)) DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. + + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem #endif - + END subroutine alloc_tot @@ -802,6 +805,8 @@ subroutine clean_memory() ! myalloc (memory.f90) + !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + #ifdef key_mpp !$acc exit data delete(te_send, tw_send, tn_send, ts_send) finalize diff --git a/src/General/step.f90 b/src/General/step.f90 index 7fa34014..13402e71 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -290,6 +290,10 @@ SUBROUTINE trcstp ! with surface boundary condition ! with IMPLICIT vertical diffusion + ! XXX: to be removed + use DIA_mem, only: diaflx + use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,tmask,avt + use simple_timer IMPLICIT NONE integer jn,jk,ji,jj @@ -335,7 +339,11 @@ SUBROUTINE trcstp call tstop("trcsms") call tstart("trczdf") + + !$acc update device(e1t,diaflx,e3t_back,e2t,trb,tmask,e3t,tra,avt,e3w) if (lzdf) IF (lzdf) CALL trczdf ! tracers: vertical diffusion + !$acc update host(diaflx,tra) if (lzdf) + call tstop("trczdf") call tstart("snutel") diff --git a/src/IO/DIA_mem.f90 b/src/IO/DIA_mem.f90 index e385f32e..223d6c27 100644 --- a/src/IO/DIA_mem.f90 +++ b/src/IO/DIA_mem.f90 @@ -50,6 +50,7 @@ SUBROUTINE alloc_DIA_local_flx() INDflxDUMP = huge(INDflxDUMP(1)) allocate(diaflx (7, Fsize, jptra )) diaflx = 0 + !$acc enter data create(diaflx) END SUBROUTINE alloc_DIA_local_flx @@ -95,6 +96,7 @@ subroutine clean_memory_dia() if (allocated(diaflx)) then deallocate(diaflx) + !$acc exit data delete(diaflx) endif if (allocated(INDflxDUMPZERO)) then diff --git a/src/PHYS/ZDF_mem.f90 b/src/PHYS/ZDF_mem.f90 index cb3fe00b..265e7004 100644 --- a/src/PHYS/ZDF_mem.f90 +++ b/src/PHYS/ZDF_mem.f90 @@ -46,6 +46,9 @@ subroutine myalloc_ZDF() jarr_zdf = huge(jarr_zdf(1,1)) allocate(jarr_zdf_flx(jpi*jpj,jpk)) jarr_zdf_flx = huge(jarr_zdf_flx(1,1)) + !$acc enter data create(jarr_zdf,jarr_zdf_flx) + !$acc update device(jarr_zdf,jarr_zdf_flx) +#ifndef _OPENACC allocate(zwd(jpk, ntids)) zwd = huge(zwd(1,1)) allocate(zws(jpk, ntids)) @@ -60,6 +63,7 @@ subroutine myalloc_ZDF() zwz = huge(zwz(1,1)) allocate(zwt(jpk, ntids)) zwt = huge(zwt(1,1)) +#endif #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -67,7 +71,28 @@ subroutine myalloc_ZDF() END subroutine myalloc_ZDF - + +#ifdef _OPENACC + subroutine myalloc_ZDF_gpu() + allocate(zwd(jpk, dimen_jvzdf)) + zwd = huge(zwd(1,1)) + allocate(zws(jpk, dimen_jvzdf)) + zws = huge(zws(1,1)) + allocate(zwi(jpk, dimen_jvzdf)) + zwi = huge(zwi(1,1)) + allocate(zwx(jpk, dimen_jvzdf)) + zwx = huge(zwx(1,1)) + allocate(zwy(jpk, dimen_jvzdf)) + zwy = huge(zwy(1,1)) + allocate(zwz(jpk, dimen_jvzdf)) + zwz = huge(zwz(1,1)) + allocate(zwt(jpk, dimen_jvzdf)) + zwt = huge(zwt(1,1)) + + !$acc enter data create(zwd,zwi,zwx,zws,zwz,zwy,zwt) + !$acc update device(zwd,zwi,zwx,zws,zwz,zwy,zwt) + END subroutine myalloc_ZDF_gpu +#endif subroutine clean_memory_zdf() @@ -82,6 +107,8 @@ subroutine clean_memory_zdf() deallocate(zwz) deallocate(zwt) + !$acc exit data delete(jarr_zdf,jarr_zdf_flx,zwd,zwi,zwx,zws,zwz,zwy,zwt) + end subroutine clean_memory_zdf diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 129613be..65d0a75b 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -174,10 +174,8 @@ SUBROUTINE trcadv !$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) @@ -353,10 +351,8 @@ SUBROUTINE trcadv !!trn could be allocate earlier !$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc enter data create(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) !$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) !$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - !$acc enter data create( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu) !$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) @@ -946,9 +942,7 @@ SUBROUTINE trcadv !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc exit data delete( tra) finalize if(use_gpu) !$acc exit data delete( trn, advmask ) finalize if(use_gpu) - !$acc exit data delete( flx_ridxt, diaflx ) finalize if(use_gpu) !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu) !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it @@ -963,7 +957,7 @@ SUBROUTINE trcadv deallocate(zbuf ) !$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu) - !$acc exit data delete( e1t, e2t, e3t, e1u, e2u, e3u, e1v, e2v, e3v, e3w, un, vn, wn ) finalize if(use_gpu) + !$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu) trcadvparttime = MPI_WTIME() - trcadvparttime trcadvtottime = trcadvtottime + trcadvparttime diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index 0a02d892..7dc67328 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -69,7 +69,7 @@ SUBROUTINE trczdf LOGICAL :: l1,l2,l3 - INTEGER :: jk,jj,ji, jn, jv, jf + INTEGER :: jk,jj,ji, jn, jv, jf, ntx ! omp variables @@ -113,7 +113,11 @@ SUBROUTINE trczdf END DO END DO END DO - + !$acc enter data create(delta_tra,int_tra) + !$acc update device(jarr_zdf,jarr_zdf_flx) +#ifdef _OPENACC + call myalloc_ZDF_gpu() +#endif ENDIF @@ -129,12 +133,19 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab + !$acc parallel loop gang vector default(present) async DO jv = 1, dimen_jvzdf ji = jarr_zdf(2,jv) jj = jarr_zdf(1,jv) Aij = e1t(jj,ji) * e2t(jj,ji) +#ifdef _OPENACC + ntx=jv +#else + ntx=1 +#endif + !! I. Vertical trends associated with lateral mixing !! ------------------------------------------------- !! (excluding the vertical flux proportional to dk[t] ) @@ -155,14 +166,14 @@ SUBROUTINE trczdf !! ... Euler time stepping when starting from rest DO jk = 1, jpkm1 z2dtt = zdt * rdt - zwi(jk, 1) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) - zws(jk, 1) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) - zwd(jk, 1) = 1. - zwi(jk, 1) - zws(jk, 1) + zwi(jk, ntx) = - z2dtt * avt(jk,jj,ji )/( e3t(jk,jj,ji) * e3w(jk,jj,ji ) ) + zws(jk, ntx) = - z2dtt * avt(jk+1,jj,ji)/( e3t(jk,jj,ji) * e3w(jk+1,jj,ji) ) + zwd(jk, ntx) = 1. - zwi(jk, ntx) - zws(jk, ntx) END DO !! Surface boundary conditions - zwi(1,1) = 0.e0 - zwd(1,1) = 1. - zws(1,1) + zwi(1,ntx) = 0.e0 + zwd(1,ntx) = 1. - zws(1,ntx) !! II.1. Vertical diffusion on tr !! ------------------------------ @@ -170,7 +181,7 @@ SUBROUTINE trczdf !! ... Euler time stepping when starting from rest DO jk = 1, jpkm1 z2dtt = zdt * rdt - zwy(jk,1) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) + zwy(jk,ntx) = trb(jk,jj,ji,jn)*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + z2dtt * tra(jk,jj,ji,jn) END DO !! Matrix inversion from the first level @@ -208,22 +219,22 @@ SUBROUTINE trczdf ikstp1=ikst+1 ikenm2=jpk-2 - zwt(ikst,1)=zwd(ikst,1) + zwt(ikst,ntx)=zwd(ikst,ntx) DO jk=ikstp1,jpkm1 - zwt(jk,1)=zwd(jk,1)-zwi(jk,1)*zws(jk-1,1)/zwt(jk-1,1) + zwt(jk,ntx)=zwd(jk,ntx)-zwi(jk,ntx)*zws(jk-1,ntx)/zwt(jk-1,ntx) END DO - zwz(ikst,1)=zwy(ikst,1) + zwz(ikst,ntx)=zwy(ikst,ntx) DO jk=ikstp1,jpkm1 - zwz(jk,1)=zwy(jk,1)-zwi(jk, 1)/zwt(jk-1, 1)*zwz(jk-1, 1) + zwz(jk,ntx)=zwy(jk,ntx)-zwi(jk, ntx)/zwt(jk-1, ntx)*zwz(jk-1, ntx) END DO - zwx(jpkm1, 1)=zwz(jpkm1, 1)/zwt(jpkm1, 1) + zwx(jpkm1, ntx)=zwz(jpkm1, ntx)/zwt(jpkm1, ntx) DO jk=ikenm2,ikst,-1 - zwx(jk, 1)=( zwz(jk, 1)-zws(jk, 1)*zwx(jk+1, 1) )/zwt(jk, 1) + zwx(jk, ntx)=( zwz(jk, ntx)-zws(jk, ntx)*zwx(jk+1, ntx) )/zwt(jk, ntx) END DO ! calculate flux due to vertical diffusion (on top face of the grid cell jk) @@ -232,7 +243,7 @@ SUBROUTINE trczdf DO jk=1,jpkm1 z2dtt = zdt * rdt - delta_tra(jk) = ( zwx(jk,1) - zwy(jk,1) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) + delta_tra(jk) = ( zwx(jk,ntx) - zwy(jk,ntx) ) / z2dtt * Aij * e3t(jk,jj,ji)! or trn(jk,jj,ji,jn+mytid) IF (jk .EQ. 1) THEN int_tra(1) = 0 @@ -258,16 +269,18 @@ SUBROUTINE trczdf !! (c a u t i o n: tracer not its trend, Leap-frog scheme done !! it will not be done in trcnxt) DO jk = 1, jpkm1 - tra(jk,jj,ji,jn) = zwx(jk,1) * tmask(jk,jj,ji) + tra(jk,jj,ji,jn) = zwx(jk,ntx) * tmask(jk,jj,ji) END DO END DO ! jv + !$acc end parallel loop ! end if END DO TRACER_LOOP !!!$omp end parallel do + !$acc wait trczdfparttime = MPI_WTIME() - trczdfparttime !cronometer-stop From fed91cb702a5f26d2314f4e29e127a1cf133c43b Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 16 Apr 2024 09:13:08 +0200 Subject: [PATCH 05/33] reduce vector length --- src/PHYS/trczdf.f90 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index 7dc67328..a1b02ee3 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -133,7 +133,8 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab - !$acc parallel loop gang vector default(present) async + ! NOTE: kernel is too big, should be split + !$acc parallel loop gang vector default(present) async vector_length(32) DO jv = 1, dimen_jvzdf ji = jarr_zdf(2,jv) From a066992ca30b941adf2ef8cad164514a3f41111f Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 24 Apr 2024 08:48:10 +0200 Subject: [PATCH 06/33] move allocations, put async everywhere, use fsx macro --- src/General/memory.f90 | 7 +- src/IO/DIA_mem.f90 | 2 +- src/PHYS/ADV_mem.f90 | 6 ++ src/PHYS/trcadv.f90 | 235 +++++++++++++++++++---------------------- 4 files changed, 123 insertions(+), 127 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 975c1480..e8497bd9 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -791,7 +791,8 @@ subroutine alloc_tot() DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. - !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt,& + !$acc& e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn) #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -958,7 +959,7 @@ subroutine clean_memory() deallocate(tra_DIA_2d_IO_HIGH) deallocate(tra_PHYS_2d_IO) deallocate(tra_PHYS_2d_IO_HIGH) - + if(lwp) then deallocate(tottrn) @@ -1000,6 +1001,8 @@ subroutine clean_memory() deallocate(highfreq_table_dia) deallocate(highfreq_table_dia2d) + !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn) + end subroutine clean_memory INTEGER FUNCTION find_index_var(string) diff --git a/src/IO/DIA_mem.f90 b/src/IO/DIA_mem.f90 index 223d6c27..4b4a896a 100644 --- a/src/IO/DIA_mem.f90 +++ b/src/IO/DIA_mem.f90 @@ -50,7 +50,7 @@ SUBROUTINE alloc_DIA_local_flx() INDflxDUMP = huge(INDflxDUMP(1)) allocate(diaflx (7, Fsize, jptra )) diaflx = 0 - !$acc enter data create(diaflx) + !$acc enter data create(flx_ridxt,diaflx) END SUBROUTINE alloc_DIA_local_flx diff --git a/src/PHYS/ADV_mem.f90 b/src/PHYS/ADV_mem.f90 index 9d201de7..d1bba762 100644 --- a/src/PHYS/ADV_mem.f90 +++ b/src/PHYS/ADV_mem.f90 @@ -114,6 +114,9 @@ subroutine myalloc_ADV() mem_all=get_mem(err) - aux_mem #endif + !$acc enter data create(advmask,zaa,zbb,zcc,inv_eu,inv_ev,inv_et,& + !$acc& big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr) + END subroutine myalloc_ADV !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1 @@ -133,6 +136,9 @@ subroutine clean_memory_adv() deallocate(big_fact_zcc) deallocate(zbtr_arr) + !$acc exit data delete(advmask,zaa,zbb,zcc,inv_eu,inv_ev,inv_et,& + !$acc& big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr) + end subroutine clean_memory_adv diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 65d0a75b..52862cf0 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -1,3 +1,8 @@ +#ifdef _OPENACC +! BUG?: the fsx routine causes additional H2D copies +#define fsx(pfx1, pfx2, pfu) ((((pfu) + abs(pfu)) * (pfx1) + ((pfu) - abs(pfu)) * (pfx2)) * 0.5) +#endif + SUBROUTINE trcadv !!!--------------------------------------------------------------------- !!! @@ -68,7 +73,7 @@ SUBROUTINE trcadv !! monthly weather review, pp 479-486 !! LOGICAL :: MPI_CHECK,l1,l2,l3 - INTEGER :: jk,jj,ji,jt,jn,jf,ju + INTEGER :: jk,jj,ji,jt,jn,jf,ju,queue double precision :: zbtr,zdt double precision :: junk, junki, junkj, junkk double precision :: timer @@ -78,10 +83,16 @@ SUBROUTINE trcadv double precision, allocatable,dimension(:,:,:) :: zx,zy,zz,zbuf double precision, allocatable,dimension(:,:,:) :: zkx,zky,zkz logical :: use_gpu + + queue=1 trcadvparttime = MPI_WTIME() +#ifdef _OPENACC use_gpu=.true. +#else + use_gpu=.false. +#endif !------------------------------------------------------------------- @@ -159,8 +170,22 @@ SUBROUTINE trcadv adv_initialized=.true. + endif + !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it + allocate(zy(jpk,jpj,jpi)) + allocate(zx(jpk,jpj,jpi)) + allocate(zz(jpk,jpj,jpi)) + allocate(ztj(jpk,jpj,jpi)) + allocate(zti(jpk,jpj,jpi)) + allocate(zkx(jpk,jpj,jpi)) + allocate(zky(jpk,jpj,jpi)) + allocate(zkz(jpk,jpj,jpi)) + allocate(zbuf(jpk,jpj,jpi)) + + !$acc enter data create(zy,zx,zz,ztj,zti,zkx,zky,zkz,zbuf) + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! end initialization phase jk=1 @@ -168,28 +193,24 @@ SUBROUTINE trcadv zdt = rdt*ndttrc !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1) shared(zbtr_arr,e1t,e2t,e3t) default(none) + !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update device( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) + !$acc update device( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update device( zbtr_arr(1:jpk,1:jpj,1:jpi) ) - !$acc enter data create( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) + !$acc update device( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) + !$acc update device( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) + !$acc update device( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) + !$acc update device( e3w(1:jpk,1:jpj,1:jpi) ) + !$acc update device( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) - !$acc enter data create( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) + !$acc update device(tra(1:jpk,1:jpj,1:jpi,1:jptra)) + !$acc update device(trn(1:jpk,1:jpj,1:jpi,1:jptra)) + !$acc update device(advmask(1:jpk,1:jpj,1:jpi)) + !$acc update device(flx_ridxt(1:Fsize,1:4)) + !$acc update device( diaflx(1:7, 1:Fsize, 1:jptra)) - !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !$acc update device( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( e3w(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update device( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -204,7 +225,7 @@ SUBROUTINE trcadv !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1,jpi,jpj,jpk) default(none) & !$OMP shared(zdt,zaa,inv_eu,e1u,e2u,e3u,un,big_fact_zaa) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -216,7 +237,7 @@ SUBROUTINE trcadv !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -227,7 +248,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -244,7 +265,7 @@ SUBROUTINE trcadv !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1,jpi,jpj,jpk) default(none) & !$OMP shared(inv_ev,e1v,e2v,e3v,vn,zdt,zbb,big_fact_zbb) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -255,7 +276,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -266,7 +287,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -281,7 +302,7 @@ SUBROUTINE trcadv !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1,jpi,jpj,jpk) default(none) & !$OMP shared(inv_et,e1t,e2t,e3w,wn,zcc,zdt,big_fact_zcc) - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -292,7 +313,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -303,7 +324,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj !dir$ vector aligned @@ -317,55 +338,12 @@ SUBROUTINE trcadv !$OMP TASKWAIT - !$acc update host( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( zbtr_arr(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - + !$acc wait(queue) !! tracer loop parallelized (macrotasking) !! ======================================= - - - !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it - allocate(zy(jpk,jpj,jpi)) - allocate(zx(jpk,jpj,jpi)) - allocate(zz(jpk,jpj,jpi)) - allocate(ztj(jpk,jpj,jpi)) - allocate(zti(jpk,jpj,jpi)) - allocate(zkx(jpk,jpj,jpi)) - allocate(zky(jpk,jpj,jpi)) - allocate(zkz(jpk,jpj,jpi)) - allocate(zbuf(jpk,jpj,jpi)) - - zy(:,:,:) = 0 - zz(:,:,:) = 0 - zx(:,:,:) = 0 - ztj(:,:,:)= 0 - zti(:,:,:)= 0 - zbuf(:,:,:) = 0. - zkx(:,:,:)=0. - zky(:,:,:)=0. - zkz(:,:,:)=0. - - !!trn could be allocate earlier - !$acc enter data create(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc enter data create(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) - !$acc enter data create(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - - !$acc enter data create( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc enter data create( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - - !$acc update device(tra(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc update device(trn(1:jpk,1:jpj,1:jpi,1:jptra)) if(use_gpu) - !$acc update device(advmask(1:jpk,1:jpj,1:jpi)) if(use_gpu) - !$acc update device(flx_ridxt(1:Fsize,1:4)) if(use_gpu) - !$acc update device( diaflx(1:7, 1:Fsize, 1:jptra)) if(use_gpu) - - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1, jpi DO jj = 1, jpj DO jk = 1, jpk @@ -382,6 +360,7 @@ SUBROUTINE trcadv ENDDO ENDDO !$acc end kernels + !$acc wait(queue) !$omp taskloop default(none) private(jf,junk,junki,junkj,junkk,zbtr) & !$omp private(zkx,zky,zkz,zti,ztj,zx,zy,zz,zbuf) shared(diaflx,jarrt,tra,zdt) & @@ -400,8 +379,8 @@ SUBROUTINE trcadv !! and mass fluxes calculated above !! calcul of tracer flux in the i and j direction - !$acc kernels async default(present) if(use_gpu) - !$acc loop independent + + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !dir$ vector aligned DO jj = 2,jpjm1 @@ -410,7 +389,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !dir$ vector aligned !$acc loop independent @@ -420,7 +399,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -430,7 +409,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -440,7 +419,7 @@ SUBROUTINE trcadv END DO !$acc end kernels ! loop unfusion - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -450,7 +429,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -460,7 +439,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) !$acc loop independent DO ji = 2,jpim1 DO jj = 2,jpjm1 @@ -472,7 +451,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels async default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !$acc loop independent DO jj = 2,jpjm1 @@ -484,7 +463,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc parallel loop async collapse(3) gang vector default(present) if(use_gpu) + !$acc parallel loop collapse(3) gang vector default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -494,7 +473,7 @@ SUBROUTINE trcadv END DO END DO !$acc end parallel loop - !$acc wait + !$acc wait(queue) ! ... Lateral boundary conditions on zk[xy] #ifdef key_mpp @@ -522,7 +501,7 @@ SUBROUTINE trcadv !! 2. calcul of after field using an upstream advection scheme !! ----------------------------------------------------------- - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji =2,jpim1 DO jj =2,jpjm1 DO jk =1,jpkm1 @@ -535,7 +514,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -547,6 +526,8 @@ SUBROUTINE trcadv ENDDO !$acc end kernels + !$acc wait(queue) + !! 2.1 start of antidiffusive correction loop @@ -561,7 +542,7 @@ SUBROUTINE trcadv if(jt .EQ. 1) then if(ncor .EQ. 1) then - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -573,7 +554,7 @@ SUBROUTINE trcadv !$acc end kernels else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -585,7 +566,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -600,7 +581,7 @@ SUBROUTINE trcadv endif else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -611,7 +592,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -623,8 +604,7 @@ SUBROUTINE trcadv !$acc end kernels endif - - + !$acc wait(queue) !! ... Lateral boundary conditions on zti #ifdef key_mpp @@ -644,7 +624,7 @@ SUBROUTINE trcadv !jk = 1 ! DO jk = 1,jpkm1 - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 junk = zti(1,jj,ji ) @@ -657,7 +637,7 @@ SUBROUTINE trcadv !$acc end kernels !DO ju=1, dimen_jarr2 - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -683,6 +663,7 @@ SUBROUTINE trcadv !$acc end kernels ! endif + !$acc wait(queue) ! ... Lateral boundary conditions on z[xyz] #ifdef key_mpp @@ -709,7 +690,7 @@ SUBROUTINE trcadv !! 2.5 calcul of the final field: !! advection by antidiffusive mass fluxes and an upstream scheme - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) !$acc loop independent DO ji = 2,jpim1 !dir$ vector aligned @@ -719,7 +700,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !dir$ vector aligned !$acc loop independent @@ -729,7 +710,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -739,7 +720,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 1,jpi !dir$ vector aligned !$acc loop independent @@ -749,7 +730,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -759,7 +740,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jj = 2,jpjm1 !dir$ vector aligned !$acc loop independent @@ -769,7 +750,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) !$acc loop independent DO ji = 2,jpim1 DO jj = 2,jpjm1 @@ -783,7 +764,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 !$acc loop independent DO jj = 2,jpjm1 @@ -797,7 +778,7 @@ SUBROUTINE trcadv END DO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 !dir$ vector aligned @@ -811,6 +792,7 @@ SUBROUTINE trcadv END DO !$acc end kernels + !$acc wait(queue) !... Lateral boundary conditions on zk[xy] #ifdef key_mpp @@ -833,7 +815,7 @@ SUBROUTINE trcadv if(ncor .EQ. 1) then - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji =2,jpim1 DO jj =2,jpjm1 DO jk =1,jpkm1 @@ -844,7 +826,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -859,7 +841,7 @@ SUBROUTINE trcadv else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO ji =2,jpim1 DO jj =2,jpjm1 DO jk =1,jpkm1 @@ -870,7 +852,7 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -882,8 +864,8 @@ SUBROUTINE trcadv ENDDO !$acc end kernels - endif + !$acc wait(queue) ENDDO ANTIDIFF_CORR @@ -894,7 +876,7 @@ SUBROUTINE trcadv if(ncor .EQ. 1) then - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) do ji=1,jpi do jj=1,jpj do jk=1,jpk @@ -906,7 +888,7 @@ SUBROUTINE trcadv else - !$acc kernels default(present) if(use_gpu) + !$acc kernels default(present) async(queue) do ji=1,jpi do jj=1,jpj do jk=1,jpk @@ -918,6 +900,8 @@ SUBROUTINE trcadv endif + !$acc wait(queue) + !!$ !!OpenMP compatibility broken. Possibility to use ifdef OpenMP + rename the file in trcadv.F90 to keep it !!$ deallocate(zy ) !!$ deallocate(zx ) @@ -932,19 +916,24 @@ SUBROUTINE trcadv END DO TRACER_LOOP + !$acc wait(queue) + !$OMP end taskloop - !$acc update host( diaflx(1:7, 1:Fsize, 1:jptra) ) if(use_gpu) - !$acc update host( tra(1:jpk,1:jpj,1:jpi,1:jptra) ) if(use_gpu) + !$acc update host( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update host( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) + !$acc update host( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) + !$acc update host( zbtr_arr(1:jpk,1:jpj,1:jpi) ) - !$acc update host( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) if(use_gpu) - !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) if(use_gpu) + !$acc update host( diaflx(1:7, 1:Fsize, 1:jptra) ) + !$acc update host( tra(1:jpk,1:jpj,1:jpi,1:jptra) ) - !$acc exit data delete( trn, advmask ) finalize if(use_gpu) - !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize if(use_gpu) + !$acc update host( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) + !$acc update host( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) + !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) + !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) + !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it deallocate(zy ) deallocate(zx ) @@ -956,24 +945,22 @@ SUBROUTINE trcadv deallocate(zkz ) deallocate(zbuf ) - !$acc exit data delete( zaa, zbb, zcc, inv_eu, inv_ev, inv_et, big_fact_zaa , big_fact_zbb, big_fact_zcc, zbtr_arr ) finalize if(use_gpu) - !$acc exit data delete( e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn ) finalize if(use_gpu) - trcadvparttime = MPI_WTIME() - trcadvparttime trcadvtottime = trcadvtottime + trcadvparttime !!!! contains +#ifndef _OPENACC double precision function fsx(pfx1, pfx2, pfu) !dir$ attributes vector :: fsx - !$acc routine seq IMPLICIT NONE double precision, INTENT(IN) :: pfx1, pfx2, pfu double precision :: abspfu abspfu = abs(pfu) fsx = ((pfu + abspfu) * pfx1 + (pfu - abspfu) * pfx2) * 0.5 end function fsx +#endif END SUBROUTINE trcadv From ccb2346d1a4a735813958bcaaaaa28d766a0106c Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 24 Apr 2024 15:32:51 +0200 Subject: [PATCH 07/33] fix serial kernel --- src/MPI/ogstm_mpi.f90 | 2 ++ src/PHYS/trcadv.f90 | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/MPI/ogstm_mpi.f90 b/src/MPI/ogstm_mpi.f90 index b6c0bcd4..f25a754a 100644 --- a/src/MPI/ogstm_mpi.f90 +++ b/src/MPI/ogstm_mpi.f90 @@ -413,6 +413,7 @@ SUBROUTINE mpplnk_my_openacc(ptab,gpu) !$acc kernels default(present) if(use_gpu) IF(nbondj.eq.0.or.nbondj.eq.1) THEN ! All but south boundary, we received from south + !$acc loop independent DO jw=1,SOUTH_count_recv ji = SOUTHpoints_recv(1,jw) jk = SOUTHpoints_recv(2,jw) @@ -423,6 +424,7 @@ SUBROUTINE mpplnk_my_openacc(ptab,gpu) IF(nbondj.eq.-1.or.nbondj.eq.0) THEN ! All but north boundary, we received from north + !$acc loop independent DO jw=1,NORTH_count_recv ji = NORTHpoints_recv(1,jw) jk = NORTHpoints_recv(2,jw) diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 52862cf0..dc8b9096 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -865,7 +865,6 @@ SUBROUTINE trcadv !$acc end kernels endif - !$acc wait(queue) ENDDO ANTIDIFF_CORR From 9340275d704b6a5730ab69a39e63fb991ea164f7 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Thu, 25 Jul 2024 15:59:34 +0200 Subject: [PATCH 08/33] more timers --- src/PHYS/trcadv.f90 | 49 ++++++++++++++++++++++++++++++++++++++++++++- src/PHYS/trchdf.f90 | 17 +++++++++++++--- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index dc8b9096..1dc1963e 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -26,6 +26,8 @@ SUBROUTINE trcadv use omp_lib USE ogstm_mpi_module + use simple_timer + implicit none !!! trcadv.smolar.h @@ -98,6 +100,8 @@ SUBROUTINE trcadv MPI_CHECK = .FALSE. + call tstart("trcadv_init") + if(.not.adv_initialized ) then ! INIT phase @@ -173,6 +177,9 @@ SUBROUTINE trcadv endif + call tstop("trcadv_init") + call tstart("trcadv_alloc") + !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it allocate(zy(jpk,jpj,jpi)) allocate(zx(jpk,jpj,jpi)) @@ -186,6 +193,8 @@ SUBROUTINE trcadv !$acc enter data create(zy,zx,zz,ztj,zti,zkx,zky,zkz,zbuf) + call tstop("trcadv_alloc") + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! end initialization phase jk=1 @@ -210,6 +219,8 @@ SUBROUTINE trcadv !$acc update device(flx_ridxt(1:Fsize,1:4)) !$acc update device( diaflx(1:7, 1:Fsize, 1:jptra)) + call tstart("trcadv_1") + !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj @@ -368,9 +379,12 @@ SUBROUTINE trcadv !$omp shared(jpim1,jpjm1,un,vn,wn,e2u,e3u,e3v,e1v,e1t,e2t,e3t,trn,advmask,jarr3,jarr_adv_flx,zbtr_arr) & !$omp firstprivate(jpkm1,dimen_jarr3,Fsize,ncor,rtrn,rsc,dimen_jarrt,jpj,jpi,jpk) + call tstop("trcadv_1") + call tstart("trcadv_tracer") TRACER_LOOP: DO jn = 1, jptra + call tstart("trcadv_tracer_1") !! 1. tracer flux in the 3 directions !! ---------------------------------- @@ -475,6 +489,9 @@ SUBROUTINE trcadv !$acc end parallel loop !$acc wait(queue) + call tstop("trcadv_tracer_1") + call tstart("trcadv_tracer_1_mpi") + ! ... Lateral boundary conditions on zk[xy] #ifdef key_mpp @@ -497,6 +514,8 @@ SUBROUTINE trcadv CALL lbc( zky(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) #endif + call tstop("trcadv_tracer_1_mpi") + call tstart("trcadv_tracer_2") !! 2. calcul of after field using an upstream advection scheme !! ----------------------------------------------------------- @@ -528,11 +547,15 @@ SUBROUTINE trcadv !$acc wait(queue) + call tstop("trcadv_tracer_2") + call tstart("trcadv_antidiffcorr") !! 2.1 start of antidiffusive correction loop ANTIDIFF_CORR: DO jt = 1,ncor + call tstart("trcadv_antidiffcorr_1") + !! 2.2 calcul of intermediary field zti @@ -606,6 +629,9 @@ SUBROUTINE trcadv !$acc wait(queue) + call tstop("trcadv_antidiffcorr_1") + call tstart("trcadv_antidiffcorr_1_mpi") + !! ... Lateral boundary conditions on zti #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors @@ -619,6 +645,8 @@ SUBROUTINE trcadv CALL lbc( zti(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) #endif + call tstop("trcadv_antidiffcorr_1_mpi") + call tstart("trcadv_antidiffcorr_2") !! 2.3 calcul of the antidiffusive flux @@ -665,6 +693,9 @@ SUBROUTINE trcadv !$acc wait(queue) + call tstop("trcadv_antidiffcorr_2") + call tstart("trcadv_antidiffcorr_2_mpi") + ! ... Lateral boundary conditions on z[xyz] #ifdef key_mpp @@ -686,6 +717,9 @@ SUBROUTINE trcadv CALL lbc( zz(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) #endif + call tstop("trcadv_antidiffcorr_2_mpi") + call tstart("trcadv_antidiffcorr_3") + !! 2.4 reinitialization !! 2.5 calcul of the final field: !! advection by antidiffusive mass fluxes and an upstream scheme @@ -794,6 +828,9 @@ SUBROUTINE trcadv !$acc wait(queue) + call tstop("trcadv_antidiffcorr_3") + call tstart("trcadv_antidiffcorr_3_mpi") + !... Lateral boundary conditions on zk[xy] #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors @@ -812,7 +849,8 @@ SUBROUTINE trcadv !! 2.6. calcul of after field using an upstream advection scheme - + call tstop("trcadv_antidiffcorr_3_mpi") + call tstart("trcadv_antidiffcorr_4") if(ncor .EQ. 1) then !$acc kernels default(present) async(queue) @@ -866,8 +904,13 @@ SUBROUTINE trcadv endif + call tstop("trcadv_antidiffcorr_4") + ENDDO ANTIDIFF_CORR + call tstop("trcadv_antidiffcorr") + call tstart("trcadv_tracer_3") + !! 3. trend due to horizontal and vertical advection of tracer jn @@ -901,6 +944,8 @@ SUBROUTINE trcadv !$acc wait(queue) + call tstop("trcadv_tracer_3") + !!$ !!OpenMP compatibility broken. Possibility to use ifdef OpenMP + rename the file in trcadv.F90 to keep it !!$ deallocate(zy ) !!$ deallocate(zx ) @@ -917,6 +962,8 @@ SUBROUTINE trcadv END DO TRACER_LOOP !$acc wait(queue) + call tstop("trcadv_tracer") + !$OMP end taskloop !$acc update host( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) diff --git a/src/PHYS/trchdf.f90 b/src/PHYS/trchdf.f90 index a70bfc9d..7c25122d 100644 --- a/src/PHYS/trchdf.f90 +++ b/src/PHYS/trchdf.f90 @@ -92,7 +92,9 @@ SUBROUTINE trchdf USE DIA_mem use mpi - + + use simple_timer + IMPLICIT NONE !!---------------------------------------------------------------------- !! local declarations @@ -113,6 +115,8 @@ SUBROUTINE trchdf !! statement functions !! =================== + call tstart("trchdf_1") + ! #include "BFM_var_list.h" trcbilaphdfparttime = MPI_WTIME() @@ -192,6 +196,9 @@ SUBROUTINE trchdf !! tracer slab !! ============= + call tstop("trchdf_1") + call tstart("trchdf_tracer") + ! $omp taskloop default(none) private(jv,jk,jj,ji) & ! $omp private(jn,ztu,ztv,zlt) firstprivate(jpi,jpj,jpk,trcrat) & ! $omp shared(zeeu,trb,tmask,zeev) & @@ -326,7 +333,7 @@ SUBROUTINE trchdf ! jk = jarr_hdf(1,jv,2) ! jf = jarr_hdf_flx(jv) - ! $OMP TASKWAIT + ! $OMP TASKWAIT DO ji = 2,jpim1 DO jj = 2,jpjm1 DO jk = 1,jpk @@ -341,7 +348,6 @@ SUBROUTINE trchdf END DO END DO - DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -360,6 +366,9 @@ SUBROUTINE trchdf END DO TRACER_LOOP ! $OMP END TASKLOOP + call tstop("trchdf_tracer") + call tstart("trchdf_2") + ! deallocate(hdfmask) ! deallocate(zeeu) ! deallocate(zeev) @@ -373,4 +382,6 @@ SUBROUTINE trchdf trcbilaphdfparttime = MPI_WTIME() - trcbilaphdfparttime trcbilaphdftottime = trcbilaphdftottime + trcbilaphdfparttime + call tstop("trchdf_2") + END SUBROUTINE trchdf From bf39f7d77eced497cb844c4dc8d91ad75246b62a Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 11:35:10 +0200 Subject: [PATCH 09/33] remove duplicated mpplnk_my_openacc routine, just use mpplnk_my --- src/MPI/ogstm_mpi.f90 | 163 +----------------------------------------- src/PHYS/trcadv.f90 | 37 +++------- 2 files changed, 9 insertions(+), 191 deletions(-) diff --git a/src/MPI/ogstm_mpi.f90 b/src/MPI/ogstm_mpi.f90 index f25a754a..1cf8a3e2 100644 --- a/src/MPI/ogstm_mpi.f90 +++ b/src/MPI/ogstm_mpi.f90 @@ -118,168 +118,7 @@ SUBROUTINE mynode !! nono : number for local neighboring processors !! !!---------------------------------------------------------------------- - SUBROUTINE mpplnk_my(ptab) - - double precision ptab(jpk,jpj,jpi) - - -#ifdef key_mpp_mpi - - INTEGER jk,jj,ji - INTEGER reqs1, reqs2, reqr1, reqr2 - INTEGER reqs3, reqs4, reqr3, reqr4 - INTEGER jw, packsize - -!! trcadvparttime = MPI_WTIME() - -!! -!!2. East and west directions exchange -!!------------------------------------ - - - -!! -!!2.2 Migrations -!! -!! -! 3 4 -! | ^ -! | | -! v | -! ________________ -! | | -! 1<-- | | 1 <-- -! 2--> | | 2 --> -! |________________| -! 3 4 -! | ^ -! | | -! v | - - packsize=jpk*jpj - - IF(nbondi.eq.-1) THEN ! We are at the west side of the domain - - CALL mppsend(2,ptab(:,:,jpi-1),packsize,noea,0,reqs1) - CALL mpprecv(1,ptab(:,:, jpi),packsize,reqr1) - - ELSE IF(nbondi.eq.0) THEN - CALL mppsend(1, ptab(:,: ,2),packsize,nowe,0,reqs1) - CALL mppsend(2, ptab(:,:,jpi-1),packsize,noea,0,reqs2) - - CALL mpprecv(1,ptab(:,:,jpi),packsize,reqr1) - CALL mpprecv(2,ptab(:,:, 1),packsize,reqr2) - - ELSE IF(nbondi.eq.1) THEN ! We are at the east side of the domain - - CALL mppsend(1,ptab(:,:,2), packsize, nowe,0, reqs1) - CALL mpprecv(2,ptab(:,:,1), packsize, reqr1) - - - ENDIF - - -!! -!! -!!3. North and south directions -!!----------------------------- -!! -!!3.1 Read Dirichlet lateral conditions -!! - - - IF(nbondj.eq.0.or.nbondj.eq.-1) THEN - DO jw=1,NORTH_count_send - ji = NORTHpoints_send(1,jw) - jk = NORTHpoints_send(2,jw) - tn_send(jw) = ptab(jk,jpj-1,ji) - ENDDO - ENDIF - IF(nbondj.eq.0.or.nbondj.eq.1) THEN - DO jw=1,SOUTH_count_send - ji = SOUTHpoints_send(1,jw) - jk = SOUTHpoints_send(2,jw) - ts_send(jw) = ptab(jk,2,ji) - ENDDO - - - ENDIF! PACK_LOOP4 - - -!! -!!2.2 Migrations -!! -!! - - IF(nbondj.eq.-1) THEN ! We are at the south side of the domain - CALL mppsend(4,tn_send,NORTH_count_send,nono,0,reqs4) - CALL mpprecv(3,tn_recv,NORTH_count_recv,reqr3) - CALL mppwait(reqs4) - CALL mppwait(reqr3) - ELSE IF(nbondj.eq.0) THEN - CALL mppsend(4, tn_send,NORTH_count_send,nono,0,reqs4) - CALL mppsend(3, ts_send,SOUTH_count_send,noso,0,reqs3) - CALL mpprecv(3,tn_recv,NORTH_count_recv,reqr3) - CALL mpprecv(4,ts_recv,SOUTH_count_recv,reqr4) - - CALL mppwait(reqs4) - CALL mppwait(reqs3) - CALL mppwait(reqr3) - CALL mppwait(reqr4) - ELSE IF(nbondj.eq.1) THEN ! We are at the north side of the domain - CALL mppsend(3,ts_send, SOUTH_count_send, noso,0, reqs3) - CALL mpprecv(4,ts_recv, SOUTH_count_recv, reqr4) - CALL mppwait(reqs3) - CALL mppwait(reqr4) - ENDIF - - - -!! -!!2.3 Write Dirichlet lateral conditions -!! - - IF(nbondj.eq.0.or.nbondj.eq.1) THEN ! All but south boundary, we received from south - - DO jw=1,SOUTH_count_recv - ji = SOUTHpoints_recv(1,jw) - jk = SOUTHpoints_recv(2,jw) - ptab(jk,1,ji)= ts_recv(jw) - ENDDO - - ENDIF - - IF(nbondj.eq.-1.or.nbondj.eq.0) THEN ! All but north boundary, we received from north - - DO jw=1,NORTH_count_recv - ji = NORTHpoints_recv(1,jw) - jk = NORTHpoints_recv(2,jw) - ptab(jk,jpj,ji)= tn_recv(jw) - ENDDO - - ENDIF ! PACK_LOOP5 - - -!!! East - West waits - - IF(nbondi.eq.-1) THEN ! We are at the west side of the domain - CALL mppwait(reqs1) - CALL mppwait(reqr1) - ELSE IF(nbondi.eq.0) THEN - CALL mppwait(reqs1) - CALL mppwait(reqs2) - CALL mppwait(reqr1) - CALL mppwait(reqr2) - ELSE IF(nbondi.eq.1) THEN ! We are at the east side of the domain - CALL mppwait(reqs1) - CALL mppwait(reqr1) - ENDIF - -#endif - -END SUBROUTINE - -SUBROUTINE mpplnk_my_openacc(ptab,gpu) +SUBROUTINE mpplnk_my(ptab,gpu) double precision ptab(jpk,jpj,jpi) #ifdef key_mpp_mpi diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 1dc1963e..156e1381 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -497,14 +497,8 @@ SUBROUTINE trcadv ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zkx) - CALL mpplnk_my(zky) -#else - CALL mpplnk_my_openacc(zkx,gpu=use_gpu) - CALL mpplnk_my_openacc(zky,gpu=use_gpu) - -#endif + CALL mpplnk_my(zkx,gpu=use_gpu) + CALL mpplnk_my(zky,gpu=use_gpu) #else @@ -635,11 +629,7 @@ SUBROUTINE trcadv !! ... Lateral boundary conditions on zti #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zti) -#else - CALL mpplnk_my_openacc(zti,gpu=use_gpu) -#endif + CALL mpplnk_my(zti,gpu=use_gpu) #else ! ... T-point, 3D array, full local array zti is initialised CALL lbc( zti(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) @@ -700,15 +690,9 @@ SUBROUTINE trcadv #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zx) - CALL mpplnk_my(zy) - CALL mpplnk_my(zz) -#else - CALL mpplnk_my_openacc(zx,gpu=use_gpu) - CALL mpplnk_my_openacc(zy,gpu=use_gpu) - CALL mpplnk_my_openacc(zz,gpu=use_gpu) -#endif + CALL mpplnk_my(zx,gpu=use_gpu) + CALL mpplnk_my(zy,gpu=use_gpu) + CALL mpplnk_my(zz,gpu=use_gpu) #else ! ... T-point, 3D array, full local array z[xyz] are initialised @@ -834,13 +818,8 @@ SUBROUTINE trcadv !... Lateral boundary conditions on zk[xy] #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors -#ifndef _OPENACC - CALL mpplnk_my(zkx) - CALL mpplnk_my(zky) -#else - CALL mpplnk_my_openacc(zkx,gpu=use_gpu) - CALL mpplnk_my_openacc(zky,gpu=use_gpu) -#endif + CALL mpplnk_my(zkx,gpu=use_gpu) + CALL mpplnk_my(zky,gpu=use_gpu) #else ! ... T-point, 3D array, full local array zk[xy] are initialised CALL lbc( zkx(:,:,:), 1, 1, 1, 1, jpk, 1, gpu=use_gpu ) From 43ff97397e988a679d8117e4972c90eaa8f6be88 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 11:35:44 +0200 Subject: [PATCH 10/33] port trchdf on GPU --- src/General/memory.f90 | 8 +++--- src/General/step.f90 | 6 +++-- src/PHYS/trchdf.f90 | 58 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index e8497bd9..c58c5f87 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -791,8 +791,8 @@ subroutine alloc_tot() DAY_LENGTH = huge(DAY_LENGTH(1,1)) forcing_phys_initialized = .false. - !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt,& - !$acc& e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn) + !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,& + !$acc& vmask,avt,e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn,ahtt) #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -806,7 +806,7 @@ subroutine clean_memory() ! myalloc (memory.f90) - !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,avt) + !$acc exit data delete(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,vmask,avt) #ifdef key_mpp @@ -1001,7 +1001,7 @@ subroutine clean_memory() deallocate(highfreq_table_dia) deallocate(highfreq_table_dia2d) - !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn) + !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn, ahtt) end subroutine clean_memory diff --git a/src/General/step.f90 b/src/General/step.f90 index 13402e71..79d7b1fa 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -291,8 +291,8 @@ SUBROUTINE trcstp ! with IMPLICIT vertical diffusion ! XXX: to be removed - use DIA_mem, only: diaflx - use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,tmask,avt + use DIA_mem, only: diaflx,flx_ridxt + use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,umask,vmask,tmask,avt,ahtt use simple_timer IMPLICIT NONE @@ -326,7 +326,9 @@ SUBROUTINE trcstp ! ----------------------------- call tstart("trchdf") + !$acc update device(umask,vmask,tmask,trb,ahtt,tra,diaflx,flx_ridxt) if(lhdf) IF (lhdf) CALL trchdf + !$acc update host(diaflx,tra) if(lhdf) call tstop("trchdf") ! tracers: sink and source (must be parallelized on vertical slab) diff --git a/src/PHYS/trchdf.f90 b/src/PHYS/trchdf.f90 index 7c25122d..e571057d 100644 --- a/src/PHYS/trchdf.f90 +++ b/src/PHYS/trchdf.f90 @@ -111,12 +111,20 @@ SUBROUTINE trchdf INTEGER :: locsum,jklef,jjlef,jilef,jkrig,jjrig,jirig !INTEGER, allocatable :: jarr_hdf(:,:,:),jarr_hdf_flx(:) double precision, allocatable,dimension(:,:,:) :: zlt, ztu, ztv + integer :: queue + logical :: use_gpu !!---------------------------------------------------------------------- !! statement functions !! =================== call tstart("trchdf_1") +#ifdef _OPENACC + use_gpu=.true. +#else + use_gpu=.false. +#endif + ! #include "BFM_var_list.h" trcbilaphdfparttime = MPI_WTIME() @@ -136,8 +144,9 @@ SUBROUTINE trchdf allocate(zeev (jpk,jpj,jpi )) zeev = huge(zeev(1,1,1)) allocate(zbtr (jpk,jpj,jpi )) - zbtr = huge(zbtr(1,1,1)) + zbtr = huge(zbtr(1,1,1)) + !$acc enter data create(hdfmask,zeeu,zeev,zbtr) DO ji = 1,jpi DO jj = 1,jpj @@ -172,10 +181,20 @@ SUBROUTINE trchdf END DO END DO + queue=1 + !$acc update device(hdfmask) hdf_initialized=.true. + ENDIF + + allocate(zlt (jpk,jpj,jpi)) + allocate(ztu (jpk,jpj,jpi)) + allocate(ztv (jpk,jpj,jpi)) + !$acc enter data create(zlt,ztu,ztv) + ! Metric arrays calculated out of the initialisation phase(for z- or s-coordinates) ! !! ---------------------------------- + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1, jpi DO jj = 1, jpj DO jk=1,jpk @@ -186,13 +205,11 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop - allocate(zlt (jpk,jpj,jpi)) - allocate(ztu (jpk,jpj,jpi)) - allocate(ztv (jpk,jpj,jpi)) !! tracer slab !! ============= @@ -206,9 +223,11 @@ SUBROUTINE trchdf TRACER_LOOP: DO jn = 1, jptra + !$acc kernels default(present) async(queue) zlt = 0. ztu = 0. ztv = 0. + !$acc end kernels !! 1. Laplacian !! ------------ @@ -222,6 +241,7 @@ SUBROUTINE trchdf ! jk = jarr_hdf(1,jv,1a) ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -235,8 +255,10 @@ SUBROUTINE trchdf END DO END DO ! $OMP END TASK + !$acc end parallel loop ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -249,6 +271,7 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop ! $OMP END TASK ! $OMP TASKWAIT @@ -261,6 +284,7 @@ SUBROUTINE trchdf ! ji = jarr_hdf(3,jv,2) ! jj = jarr_hdf(2,jv,2) ! jk = jarr_hdf(1,jv,2) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 2,jpi-1 DO jj = 2,jpj-1 DO jk = 1,jpk @@ -273,6 +297,7 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop @@ -281,11 +306,12 @@ SUBROUTINE trchdf !! ... Lateral boundary conditions on the laplacian (zlt,zls) + !$acc wait(queue) #ifdef key_mpp ! ... Mpp : export boundary values to neighboring processors - - CALL mpplnk_my(zlt) + CALL mpplnk_my(zlt,gpu=use_gpu) #else +#error CALL lbc( zlt(:,:,:), 1, 1, 1, 1, jpk, 1 ) #endif @@ -298,6 +324,7 @@ SUBROUTINE trchdf !!!&omp& dimen_jvhdf3,zta,zbtr,tra,jarr_hdf_flx,diaflx,Fsize) ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -308,9 +335,11 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop ! $OMP END TASK ! $OMP TASK default(shared) private(ji,jj,jk) + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 1,jpi-1 DO jj = 1,jpj-1 DO jk = 1,jpk @@ -322,6 +351,7 @@ SUBROUTINE trchdf END DO END DO END DO + !$acc end parallel loop ! $OMP END TASK !! ... fourth derivative (divergence) and add to the general tracer trend @@ -334,6 +364,7 @@ SUBROUTINE trchdf ! jf = jarr_hdf_flx(jv) ! $OMP TASKWAIT + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji = 2,jpim1 DO jj = 2,jpjm1 DO jk = 1,jpk @@ -346,8 +377,10 @@ SUBROUTINE trchdf !tra(jk,jj,ji,jn ) = tra(jk,jj,ji,jn ) + zta END DO END DO - END DO + END DO + !$acc end parallel loop + !$acc parallel loop gang vector default(present) async(queue) DO jf=1,Fsize jk = flx_ridxt(jf,2) jj = flx_ridxt(jf,3) @@ -357,6 +390,7 @@ SUBROUTINE trchdf diaflx(6,jf, jn) = diaflx(6,jf, jn) - ztv(jk,jj,ji)*rdt ENDDO + !$acc end parallel loop @@ -365,6 +399,7 @@ SUBROUTINE trchdf END DO TRACER_LOOP ! $OMP END TASKLOOP + !$acc wait(queue) call tstop("trchdf_tracer") call tstart("trchdf_2") @@ -374,10 +409,11 @@ SUBROUTINE trchdf ! deallocate(zeev) ! deallocate(zbtr) - deallocate(zlt) - deallocate(ztu) - deallocate(ztv) - + !$acc exit data delete(zlt,ztu,ztv) + deallocate(zlt) + deallocate(ztu) + deallocate(ztv) + trcbilaphdfparttime = MPI_WTIME() - trcbilaphdfparttime trcbilaphdftottime = trcbilaphdftottime + trcbilaphdfparttime From 0772e24b6689667ebc5b1e73a5e8202d2f9e4de7 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 15:31:38 +0200 Subject: [PATCH 11/33] mpplnk_my timer --- src/MPI/ogstm_mpi.f90 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/MPI/ogstm_mpi.f90 b/src/MPI/ogstm_mpi.f90 index 1cf8a3e2..b1bbf0ed 100644 --- a/src/MPI/ogstm_mpi.f90 +++ b/src/MPI/ogstm_mpi.f90 @@ -19,6 +19,8 @@ MODULE ogstm_mpi_module use petscvec, only: PETSC_COMM_WORLD, PETSC_NULL_CHARACTER #endif +use simple_timer + implicit NONE @@ -130,6 +132,8 @@ SUBROUTINE mpplnk_my(ptab,gpu) logical,optional :: gpu logical :: use_gpu + call tstart("mpplnk_my") + use_gpu=.false. if(present(gpu)) use_gpu=gpu @@ -290,6 +294,8 @@ SUBROUTINE mpplnk_my(ptab,gpu) #endif + call tstop("mpplnk_my") + END SUBROUTINE From 9d71a04811f6d9267ef763939fb8d7171041ca51 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 7 May 2024 17:33:52 +0200 Subject: [PATCH 12/33] trcave GPU port --- src/BIO/trclec.f90 | 3 +++ src/General/memory.f90 | 10 +++++++-- src/PHYS/trcave.f90 | 47 +++++++++++++++++++++++++++++++++++------- src/PHYS/trczdf.f90 | 3 ++- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/src/BIO/trclec.f90 b/src/BIO/trclec.f90 index d6af01f4..b99be8a3 100644 --- a/src/BIO/trclec.f90 +++ b/src/BIO/trclec.f90 @@ -141,6 +141,7 @@ SUBROUTINE trclec if (ctr_hf(ji).eq.1) jptra_high = jptra_high + 1 enddo allocate(highfreq_table(jptra_HIGH)) + !$acc enter data create(highfreq_table) highfreq_table = huge(highfreq_table(1)) jptra_high = 0 @@ -165,6 +166,7 @@ SUBROUTINE trclec if (lwp) write(*,*) 'High freq diagnostics number :', jptra_dia_HIGH allocate(highfreq_table_dia(jptra_dia_HIGH)) + !$acc enter data create(highfreq_table_dia) jptra_dia_high = 0 @@ -188,6 +190,7 @@ SUBROUTINE trclec if (lwp) write(*,*) 'High freq diagnostics number 2d:', jptra_dia2d_HIGH allocate(highfreq_table_dia2d(jptra_dia2d_HIGH)) + !$acc enter data create(highfreq_table_dia2d) jptra_dia2d_high = 0 diff --git a/src/General/memory.f90 b/src/General/memory.f90 index c58c5f87..96db7c26 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -792,7 +792,10 @@ subroutine alloc_tot() forcing_phys_initialized = .false. !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,& - !$acc& vmask,avt,e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn,ahtt) + !$acc& vmask,avt,e1u,e2u,e3u,e1v,e2v,e3v,un,vn,wn,trn,ahtt,traIO,traIO_HIGH,& + !$acc& snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,sn,tn,& + !$acc& tra_DIA,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,& + !$acc& vatm,emp,qsr,tra_DIA_2d_IO_HIGH) #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem @@ -997,11 +1000,14 @@ subroutine clean_memory() ! trclec + !$acc exit data delete(highfreq_table,highfreq_table_dia,highfreq_table_dia2d) deallocate(highfreq_table) deallocate(highfreq_table_dia) deallocate(highfreq_table_dia2d) - !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn, ahtt) + !$acc exit data delete(trn, e1u, e2u, e3u, e1v, e2v, e3v, un, vn, wn,& + !$acc& ahtt, traio,traIO_HIGH,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,sn,tn,& + !$acc& tra_DIA,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,vatm,emp,qsr,tra_DIA_2d_IO_HIGH) end subroutine clean_memory diff --git a/src/PHYS/trcave.f90 b/src/PHYS/trcave.f90 index 8d8313a5..617a0212 100644 --- a/src/PHYS/trcave.f90 +++ b/src/PHYS/trcave.f90 @@ -10,14 +10,19 @@ SUBROUTINE trcave integer :: jn_high, jn_on_all double precision :: Miss_val =1.e20 double precision :: elapsed_time, inv_incremented_time + integer :: queue ave_partTime = MPI_WTIME() + queue=1 ! FIRST, LOW FREQUENCY elapsed_time = elapsed_time_2 inv_incremented_time = 1./(elapsed_time_2 + rdt) + !$acc update device(traIO,trn,umask,vmask,tmask,traIO_HIGH,highfreq_table,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,sn,tn,wn,avt,e3t,un,vn,tra_DIA_IO,tra_DIA,tra_DIA_2d_IO,tra_DIA_2d,vatmIO,empIO,qsrIO,vatm,emp,qsr,highfreq_table_dia,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH,highfreq_table_dia2d) + + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn=1 ,jptra DO ji=1, jpi @@ -33,6 +38,7 @@ SUBROUTINE trcave END DO END DO + !$acc end parallel loop @@ -40,15 +46,21 @@ SUBROUTINE trcave elapsed_time = elapsed_time_1 inv_incremented_time = 1./(elapsed_time_1 + rdt)! ****************** HIGH FREQUENCY + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn_high=1 ,jptra_high +#ifndef _OPENACC jn_on_all = highfreq_table(jn_high) +#endif DO ji=1, jpi DO jj=1, jpj DO jk=1, jpk +#ifdef _OPENACC + jn_on_all = highfreq_table(jn_high) +#endif IF(tmask(jk,jj,ji) .NE. 0.) THEN traIO_HIGH(jk,jj,ji,jn_high )= & & (traIO_HIGH(jk,jj,ji,jn_high )*elapsed_time+trn(jk,jj,ji,jn_on_all)*rdt)*inv_incremented_time @@ -61,6 +73,7 @@ SUBROUTINE trcave END DO + !$acc end parallel loop ! ***************** PHYS ***************************************************** @@ -73,6 +86,7 @@ SUBROUTINE trcave endif + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji=1, jpi DO jj=1, jpj DO jk=1, jpk @@ -107,7 +121,9 @@ SUBROUTINE trcave END DO END DO END DO + !$acc end parallel loop + !$acc parallel loop gang vector collapse(2) default(present) async(queue) DO jj=1, jpj DO ji=1, jpi IF (tmask(1,jj,ji) .NE. 0.) THEN @@ -121,6 +137,7 @@ SUBROUTINE trcave ENDIF END DO END DO + !$acc end parallel loop ! ***************** END PHYS ************************************************* @@ -135,6 +152,7 @@ SUBROUTINE trcave inv_incremented_time = 1./(elapsed_time_2 + rdt) + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn = 1,jptra_dia DO ji=1, jpi DO jj=1, jpj @@ -148,10 +166,12 @@ SUBROUTINE trcave END DO END DO ENDDO + !$acc end parallel loop ! ********************* DIAGNOSTICS 2D ********** + !$acc parallel loop gang vector collapse(2) default(present) async(queue) DO ji=1, jpi DO jj=1, jpj IF(tmask(1,jj,ji) .NE. 0.) THEN ! Warning ! Tested only for surface @@ -162,6 +182,7 @@ SUBROUTINE trcave ENDIF END DO END DO + !$acc end parallel loop @@ -172,11 +193,17 @@ SUBROUTINE trcave if (jptra_dia_high.gt.0) THEN + !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn_high=1, jptra_dia_high +#ifndef _OPENACC jn_on_all = highfreq_table_dia(jn_high ) +#endif DO ji=1, jpi DO jj=1, jpj DO jk=1, jpk +#ifdef _OPENACC + jn_on_all = highfreq_table_dia(jn_high ) +#endif IF(tmask(jk,jj,ji) .NE. 0.) THEN tra_DIA_IO_HIGH(jk,jj,ji,jn_high )= & & (tra_DIA_IO_HIGH(jk,jj,ji,jn_high )*elapsed_time+tra_DIA(jk,jj,ji,jn_on_all)*rdt)*inv_incremented_time @@ -187,6 +214,7 @@ SUBROUTINE trcave END DO END DO END DO + !$acc end parallel loop endif @@ -194,24 +222,29 @@ SUBROUTINE trcave if (jptra_dia2d_high.gt.0) THEN + !$acc parallel loop gang vector collapse(3) default(present) async(queue) DO ji=1, jpi DO jj=1, jpj - IF(tmask(1,jj,ji) .NE. 0.) THEN DO jn_high=1, jptra_dia2d_high - jn_on_all = highfreq_table_dia2d(jn_high) - tra_DIA_2d_IO_HIGH(jn_high,jj,ji)= & - & (tra_DIA_2d_IO_HIGH(jn_high,jj,ji)*elapsed_time+tra_DIA_2d(jn_on_all,jj,ji)*rdt)*inv_incremented_time + IF(tmask(1,jj,ji) .NE. 0.) THEN + jn_on_all = highfreq_table_dia2d(jn_high) + tra_DIA_2d_IO_HIGH(jn_high,jj,ji)= & + & (tra_DIA_2d_IO_HIGH(jn_high,jj,ji)*elapsed_time+tra_DIA_2d(jn_on_all,jj,ji)*rdt)*inv_incremented_time + ELSE + tra_DIA_2d_IO_HIGH(jn_high,jj,ji)=Miss_val + ENDIF END DO - ELSE - tra_DIA_2d_IO_HIGH(:,jj,ji)=Miss_val - ENDIF END DO END DO + !$acc end parallel loop endif endif ! lfbm + !$acc wait(queue) + !$acc update host(traIO,traIO_HIGH,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH) + ave_partTime = MPI_WTIME() - ave_partTime ave_TotTime = ave_TotTime + ave_partTime diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index a1b02ee3..76de5ef0 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -133,7 +133,8 @@ SUBROUTINE trczdf ztavg = 0.e0 !! vertical slab - ! NOTE: kernel is too big, should be split + ! NOTE: kernel is too big, should be split by adding a new jv dimension + ! on zwi zws zwd zwy zwt zwz zwx !$acc parallel loop gang vector default(present) async vector_length(32) DO jv = 1, dimen_jvzdf From c3fc5a1b56df8aae068b8d2fea53a55c7f902886 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 13 May 2024 10:48:35 +0200 Subject: [PATCH 13/33] remove debug statement --- src/PHYS/trchdf.f90 | 1 - 1 file changed, 1 deletion(-) diff --git a/src/PHYS/trchdf.f90 b/src/PHYS/trchdf.f90 index e571057d..2d4fc889 100644 --- a/src/PHYS/trchdf.f90 +++ b/src/PHYS/trchdf.f90 @@ -311,7 +311,6 @@ SUBROUTINE trchdf ! ... Mpp : export boundary values to neighboring processors CALL mpplnk_my(zlt,gpu=use_gpu) #else -#error CALL lbc( zlt(:,:,:), 1, 1, 1, 1, jpk, 1 ) #endif From 3918ff082c920e0d6f733c6b9dcd96afd9fe01c2 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 13 May 2024 16:47:53 +0200 Subject: [PATCH 14/33] workaround: data not present on device --- src/PHYS/trczdf.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHYS/trczdf.f90 b/src/PHYS/trczdf.f90 index 76de5ef0..615883f7 100644 --- a/src/PHYS/trczdf.f90 +++ b/src/PHYS/trczdf.f90 @@ -113,12 +113,12 @@ SUBROUTINE trczdf END DO END DO END DO - !$acc enter data create(delta_tra,int_tra) !$acc update device(jarr_zdf,jarr_zdf_flx) #ifdef _OPENACC call myalloc_ZDF_gpu() #endif ENDIF + !$acc enter data create(delta_tra,int_tra) !! passive tracer slab From 7864fa822544ad40380ea6c66ed76d3e0a000954 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 13 May 2024 16:58:48 +0200 Subject: [PATCH 15/33] trcbio GPU port --- src/BIO/BIO_mem.f90 | 17 +++- src/BIO/trcbio.f90 | 176 ++++++++++++++++++++---------------- src/BIO/trcsms.f90 | 5 + src/General/TimeManager.f90 | 1 + src/General/memory.f90 | 16 +++- 5 files changed, 135 insertions(+), 80 deletions(-) diff --git a/src/BIO/BIO_mem.f90 b/src/BIO/BIO_mem.f90 index fb5e3bd2..b3544716 100644 --- a/src/BIO/BIO_mem.f90 +++ b/src/BIO/BIO_mem.f90 @@ -18,6 +18,10 @@ MODULE BIO_mem double precision, allocatable :: ogstm_ph(:,:,:) ! GUESS for FOLLOWS algorithm double precision, allocatable :: NPPF2(:,:,:) double precision, allocatable :: ogstm_co2(:,:), co2_IO(:,:,:) + double precision, allocatable :: sediPPY(:,:) + double precision, allocatable :: local_D3DIAGNOS(:,:) + double precision, allocatable :: local_D2DIAGNOS(:,:) + double precision, allocatable :: er(:,:) double precision:: ice @@ -38,7 +42,7 @@ subroutine myalloc_BIO() allocate(co2_IO(jpj,jpi,2)) co2_IO = huge(co2_IO(1,1,1)) - allocate(ogstm_sedipi(jpk,jpj,jpi,4)) + allocate(ogstm_sedipi(jpk,jpj,jpi,4)) ogstm_sedipi = huge(ogstm_sedipi(1,1,1,1)) allocate(ogstm_ph(jpk,jpj,jpi)) ogstm_ph = huge(ogstm_ph(1,1,1)) @@ -48,6 +52,12 @@ subroutine myalloc_BIO() ! and used in hard_tissue_pump.F also in land points ice=0 + allocate(sediPPY(jpi * jpj * jpk, 4)) + allocate(local_D3DIAGNOS(jpi * jpj * jpk, jptra_dia)) + allocate(local_D2DIAGNOS(jpi * jpj, jptra_dia_2d)) + allocate(er(jpi * jpj * jpk, 11)) + !$acc enter data create(ogstm_co2,ogstm_sedipi,ogstm_ph,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS,er) + #ifdef Mem_Monitor mem_all=get_mem(err) - aux_mem #endif @@ -63,6 +73,11 @@ subroutine clean_memory_bio() deallocate(ogstm_sedipi) deallocate(ogstm_ph) deallocate(NPPF2) + deallocate(sediPPY) + deallocate(local_D3DIAGNOS) + deallocate(local_D2DIAGNOS) + deallocate(er) + !$acc exit data delete(ogstm_co2,ogstm_sedipi,ogstm_ph,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS,er) end subroutine clean_memory_bio diff --git a/src/BIO/trcbio.f90 b/src/BIO/trcbio.f90 index f84fe51e..d830a0af 100644 --- a/src/BIO/trcbio.f90 +++ b/src/BIO/trcbio.f90 @@ -49,22 +49,20 @@ SUBROUTINE trcbio IMPLICIT NONE - double precision, dimension(jpi * jpj * jpk, 4) :: sediPPY - double precision, dimension(jpi * jpj * jpk, jptra_dia) :: local_D3DIAGNOS - double precision, dimension(jpi * jpj, jptra_dia_2d) :: local_D2DIAGNOS - double precision, dimension(jpi * jpj * jpk, 11) :: er - - integer :: jk, jj, ji, jn, jlinear2d, jlinear3d, bottom + integer :: jk, jj, ji, jn, jlinear2d, jlinear3d, bottom, queue double precision :: correct_fact, gdept_local, gdeptmax_local integer :: year, month, day double precision :: sec - call tstart("trcbio_1") BIOparttime = MPI_WTIME() ! Initialization + + queue=1 + + !$acc kernels default(present) async(queue) D3STATE = 1.0 er = 1.0 er(:,10) = 8.1 @@ -73,66 +71,83 @@ SUBROUTINE trcbio ! ogstm_sediPI appear to be unused ogstm_sediPI = 0. + !$acc end kernels + + ! NOTE: this kernel *needs* to be executed synchronously as we need the + ! reduced `bottom` scalar value on host before launching the next kernel. + bottom=0 + !$acc parallel loop gang vector reduction(max:bottom) collapse(2) default(present) + do ji = 1, jpi + do jj = 1, jpj + bottom = max(bottom,mbathy(jj,ji)) + end do + end do + !$acc end parallel loop ! Set D3STATE (pass state to BFM) + !$acc parallel loop gang vector default(present) collapse(4) async(queue) do jn = 1, jptra do ji = 1, jpi do jj = 1, jpj - if (bfmmask(1, jj, ji) == 0) then - cycle - else - bottom = mbathy(jj, ji) - do jk = 1, bottom - jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj - D3STATE(jlinear3d, jn) = trn(jk, jj, ji, jn) + do jk = 1, bottom + if (.not. bfmmask(1, jj, ji) == 0) then + if (jk <= mbathy(jj, ji)) then + jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj + D3STATE(jlinear3d, jn) = trn(jk, jj, ji, jn) + endif + endif end do - end if end do end do end do + !$acc end parallel loop call read_date_string(COMMON_DATEstring, year, month, day, sec) - ! Set er + ! Set er + !$acc parallel loop gang vector default(present) collapse(3) async(queue) do ji = 1, jpi - do jj = 1, jpj - if (bfmmask(1, jj, ji) == 0) then - cycle - else - bottom = mbathy(jj, ji) + do jj = 1, jpj do jk = 1, bottom - jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj - if (jk .eq. 1) then - er(jlinear3d, 4) = ice ! from 0 to 1 adimensional - er(jlinear3d, 5) = ogstm_co2(jj, ji) ! CO2 Mixing Ratios (ppm) 390 - er(jlinear3d, 7) = DAY_LENGTH(jj, ji) ! fotoperiod expressed in hours - er(jlinear3d, 9) = vatm(jj, ji) ! wind speed (m/s) - end if - er(jlinear3d, 1) = tn(jk, jj, ji) ! Temperature (Celsius) - er(jlinear3d, 2) = sn(jk, jj, ji) ! Salinity PSU - er(jlinear3d, 3) = rho(jk, jj, ji) ! Density Kg/m3 - er(jlinear3d, 6) = instant_par_from_sec(sec, xpar(jk, jj, ji)) ! PAR umoles/m2/s | Watt to umoles photons W2E=1./0.217 - er(jlinear3d, 8) = e3t(jk, jj, ji) ! depth in meters of the given cell - er(jlinear3d, 10) = ogstm_PH(jk, jj, ji) ! 8.1 + if (.not. bfmmask(1, jj, ji) == 0) then + if (jk <= mbathy(jj, ji)) then + jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj + if (jk .eq. 1) then + er(jlinear3d, 4) = ice ! from 0 to 1 adimensional + er(jlinear3d, 5) = ogstm_co2(jj, ji) ! CO2 Mixing Ratios (ppm) 390 + er(jlinear3d, 7) = DAY_LENGTH(jj, ji) ! fotoperiod expressed in hours + er(jlinear3d, 9) = vatm(jj, ji) ! wind speed (m/s) + end if + er(jlinear3d, 1) = tn(jk, jj, ji) ! Temperature (Celsius) + er(jlinear3d, 2) = sn(jk, jj, ji) ! Salinity PSU + er(jlinear3d, 3) = rho(jk, jj, ji) ! Density Kg/m3 + er(jlinear3d, 6) = instant_par_from_sec(sec, xpar(jk, jj, ji)) ! PAR umoles/m2/s | Watt to umoles photons W2E=1./0.217 + er(jlinear3d, 8) = e3t(jk, jj, ji) ! depth in meters of the given cell + er(jlinear3d, 10) = ogstm_PH(jk, jj, ji) ! 8.1 #ifdef gdept1d - gdept_local = gdept(jk) - gdeptmax_local = gdept(jpk) + gdept_local = gdept(jk) + gdeptmax_local = gdept(jpk) #else - gdept_local = gdept(jk, jj, ji) - gdeptmax_local = gdept(jpk, jj, ji) + gdept_local = gdept(jk, jj, ji) + gdeptmax_local = gdept(jpk, jj, ji) #endif - if (gdept_local .lt. 1000.0D0) then - correct_fact = 1.0D0 - else if (gdept_local .lt. 2000.0D0) then - correct_fact = 0.25D0 - else - correct_fact = 0.0D0 - end if - er(jlinear3d, 11) = correct_fact * (gdeptmax_local - gdept_local) / gdept_local + if (gdept_local .lt. 1000.0D0) then + correct_fact = 1.0D0 + else if (gdept_local .lt. 2000.0D0) then + correct_fact = 0.25D0 + else + correct_fact = 0.0D0 + end if + er(jlinear3d, 11) = correct_fact * (gdeptmax_local - gdept_local) / gdept_local + endif + end if end do - end if - end do + end do end do + !$acc end parallel loop + + !$acc wait(queue) + !$acc update host(D3STATE,er) call tstop("trcbio_1") call tstart("BFM1D_In_EcologyDynamics") @@ -149,54 +164,61 @@ SUBROUTINE trcbio call tstop("BFM1D_Out_EcologyDynamics") call tstart("trcbio_2") - ! The following copies could be avoided + !$acc update device(D3SOURCE,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS) + + !$acc parallel loop gang vector collapse(4) default(present) async(queue) do jn = 1, max(4, jptra, jptra_dia) do ji = 1, jpi do jj = 1, jpj - if (bfmmask(1, jj, ji) == 0) then - cycle - else - bottom = mbathy(jj, ji) - do jk = 1, bottom - jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj - if (jn .le. jptra) then - tra(jk, jj, ji, jn) = tra(jk, jj, ji, jn) + D3SOURCE(jlinear3d, jn) ! trend - end if - if (jn .le. jptra_dia) then - tra_DIA(jk, jj, ji, jn) = local_D3DIAGNOS(jlinear3d, jn) - end if - if (jn .le. 4) then - ogstm_sediPI(jk, jj, ji, jn) = sediPPY(jlinear3d, jn) ! BFM output of sedimentation speed (m/d) + do jk = 1, bottom + if (.not. bfmmask(1, jj, ji) == 0) then + if (jk <= mbathy(jj, ji)) then + jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj + if (jn .le. jptra) then + tra(jk, jj, ji, jn) = tra(jk, jj, ji, jn) + D3SOURCE(jlinear3d, jn) ! trend + endif + if (jn .le. jptra_dia) then + tra_DIA(jk, jj, ji, jn) = local_D3DIAGNOS(jlinear3d, jn) + endif + if (jn .le. 4) then + ogstm_sediPI(jk, jj, ji, jn) = sediPPY(jlinear3d, jn) ! BFM output of sedimentation speed (m/d) + endif end if - end do - end if + end if + end do end do end do end do - + !$acc end parallel loop + + !$acc parallel loop gang vector collapse(3) default(present) async(queue) do ji = 1, jpi do jj = 1, jpj - if (bfmmask(1, jj, ji) == 0) then - cycle - else - bottom = mbathy(jj, ji) - jlinear2d = jj + (ji - 1) * jpj - tra_DIA_2d(:, jj, ji) = local_D2DIAGNOS(jlinear2d, :) - do jk = 1, bottom - jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj - ogstm_PH(jk, jj, ji) = local_D3DIAGNOS(jlinear3d, pppH) ! Follows solver guess, put 8.0 if pppH is not defined - end do - end if + do jk = 1, bottom + if (.not. bfmmask(1, jj, ji) == 0) then + if (jk <= mbathy(jj, ji)) then + jlinear2d = jj + (ji - 1) * jpj + tra_DIA_2d(:, jj, ji) = local_D2DIAGNOS(jlinear2d, :) + jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj + ogstm_PH(jk, jj, ji) = local_D3DIAGNOS(jlinear3d, pppH) ! Follows solver guess, put 8.0 if pppH is not defined + end if + end if + end do end do end do + !$acc end parallel loop !--------------------------------------------------------------------- ! BEGIN BC_REFACTORING SECTION !--------------------------------------------------------------------- + ! XXX: when should we care about this ? call boundaries%fix_diagnostic_vars() !---------------------------------------------------------------------- ! END BC_REFACTORING SECTION !--------------------------------------------------------------------- + + !$acc wait(queue) + BIOparttime = MPI_WTIME() - BIOparttime BIOtottime = BIOtottime + BIOparttime call tstop("trcbio_2") diff --git a/src/BIO/trcsms.f90 b/src/BIO/trcsms.f90 index fc2ab56d..0973bf41 100644 --- a/src/BIO/trcsms.f90 +++ b/src/BIO/trcsms.f90 @@ -23,6 +23,9 @@ SUBROUTINE trcsms USE myalloc USE mpi use simple_timer + + ! XXX: to remove + use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 IMPLICIT NONE @@ -36,7 +39,9 @@ SUBROUTINE trcsms CALL trcopt ! tracers: optical model call tstart("trcbio") + !$acc update device(mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,xpar,e3t,gdept,ogstm_PH,ogstm_co2) CALL trcbio ! tracers: biological model + !$acc update host(tra,tra_DIA,tra_DIA_2d,ogstm_sediPI,ogstm_PH) call tstop("trcbio") !! trcsed no updated for time step advancing diff --git a/src/General/TimeManager.f90 b/src/General/TimeManager.f90 index e095e8de..ba36df25 100644 --- a/src/General/TimeManager.f90 +++ b/src/General/TimeManager.f90 @@ -799,6 +799,7 @@ END FUNCTION INSTANT_PAR double precision FUNCTION INSTANT_PAR_from_sec(sec, MEAN_PAR) + !$acc routine seq IMPLICIT NONE double precision, INTENT(IN) :: MEAN_PAR ! daily integral ! LOCAL diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 96db7c26..15a6a3cf 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -542,6 +542,7 @@ subroutine alloc_tot() allocate(gdept(jpk,jpj,jpi)) gdept = huge(gdept(1,1,1)) #endif + !$acc enter data create(gdept) allocate(gdepw(jpk)) gdepw = huge(gdepw(1)) @@ -567,7 +568,8 @@ subroutine alloc_tot() e3w = huge(e3w(1,1,1)) allocate(mbathy(jpj,jpi)) - mbathy = huge(mbathy(1,1)) + mbathy = huge(mbathy(1,1)) + !$acc enter data create(mbathy) allocate(tmask(jpk,jpj,jpi)) tmask = huge(tmask(1,1,1)) @@ -586,6 +588,7 @@ subroutine alloc_tot() allocate(bfmmask(jpk, jpj, jpi)) bfmmask = huge(bfmmask(1, 1, 1)) + !$acc enter data create(bfmmask) allocate(un(jpk,jpj,jpi)) un = huge(un(1,1,1)) @@ -603,6 +606,7 @@ subroutine alloc_tot() rhopn = huge(rhopn(1,1,1)) allocate(rho(jpk,jpj,jpi)) rho = huge(rho(1,1,1)) + !$acc enter data create(rho) allocate(ahtu(jpk)) ahtu = huge(ahtu(1)) @@ -781,7 +785,8 @@ subroutine alloc_tot() # endif #ifdef key_trc_bfm - allocate(xpar(jpk,jpj,jpi)) + allocate(xpar(jpk,jpj,jpi)) + !$acc enter data create(xpar) xpar = huge(xpar(1,1,1)) #endif @@ -789,6 +794,7 @@ subroutine alloc_tot() !! photoperiod allocate(DAY_LENGTH(jpj,jpi)) DAY_LENGTH = huge(DAY_LENGTH(1,1)) + !$acc enter data create(DAY_LENGTH) forcing_phys_initialized = .false. !$acc enter data create(e1t,e2t,e3t,e3w,e3t_back,tra,trb,tmask,umask,& @@ -869,6 +875,7 @@ subroutine clean_memory() deallocate(ff) deallocate(gdept) + !$acc exit data delete(gdept) deallocate(gdepw) deallocate(e3t_0) deallocate(e3u_0) @@ -882,6 +889,7 @@ subroutine clean_memory() deallocate(e3w) deallocate(mbathy) + !$acc exit data delete(mbathy) deallocate(tmask) deallocate(h_column) @@ -893,6 +901,7 @@ subroutine clean_memory() deallocate(vmask) deallocate(bfmmask) + !$acc exit data delete(bfmmask) deallocate(un) deallocate(vn) @@ -902,6 +911,7 @@ subroutine clean_memory() deallocate(rdn) deallocate(rhopn) deallocate(rho) + !$acc exit data delete(rho) deallocate(ahtu) deallocate(ahtv) @@ -994,9 +1004,11 @@ subroutine clean_memory() #ifdef key_trc_bfm deallocate(xpar) + !$acc exit data delete(xpar) #endif deallocate(DAY_LENGTH) + !$acc exit data delete(DAY_LENGTH) ! trclec From df705bb72f96657edfa6446641c5c046178c2669 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 14 May 2024 09:15:15 +0200 Subject: [PATCH 16/33] BFM1D_Input_EcologyDynamics gpu port --- src/BIO/trcbio.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BIO/trcbio.f90 b/src/BIO/trcbio.f90 index d830a0af..aaf6d90a 100644 --- a/src/BIO/trcbio.f90 +++ b/src/BIO/trcbio.f90 @@ -147,7 +147,7 @@ SUBROUTINE trcbio !$acc end parallel loop !$acc wait(queue) - !$acc update host(D3STATE,er) + !$acc update host(D3STATE) call tstop("trcbio_1") call tstart("BFM1D_In_EcologyDynamics") From 59731929325159f32588b40e91dcc9e2212fd544 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 14 May 2024 16:19:57 +0200 Subject: [PATCH 17/33] BFM1D_Output_EcologyDynamics gpu port --- src/BIO/trcbio.f90 | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/BIO/trcbio.f90 b/src/BIO/trcbio.f90 index aaf6d90a..ac1860e1 100644 --- a/src/BIO/trcbio.f90 +++ b/src/BIO/trcbio.f90 @@ -147,7 +147,6 @@ SUBROUTINE trcbio !$acc end parallel loop !$acc wait(queue) - !$acc update host(D3STATE) call tstop("trcbio_1") call tstart("BFM1D_In_EcologyDynamics") @@ -164,8 +163,6 @@ SUBROUTINE trcbio call tstop("BFM1D_Out_EcologyDynamics") call tstart("trcbio_2") - !$acc update device(D3SOURCE,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS) - !$acc parallel loop gang vector collapse(4) default(present) async(queue) do jn = 1, max(4, jptra, jptra_dia) do ji = 1, jpi From 6d62722e79ca3fba1b515d515817911f0224ba9f Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 15 May 2024 08:56:02 +0200 Subject: [PATCH 18/33] add more timers in trcsms --- src/BIO/trcsms.f90 | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/BIO/trcsms.f90 b/src/BIO/trcsms.f90 index 0973bf41..83b61914 100644 --- a/src/BIO/trcsms.f90 +++ b/src/BIO/trcsms.f90 @@ -36,7 +36,11 @@ SUBROUTINE trcsms !! this first routines are parallelized on vertical slab + call tstart("trcopt") + CALL trcopt ! tracers: optical model + + call tstop("trcopt") call tstart("trcbio") !$acc update device(mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,xpar,e3t,gdept,ogstm_PH,ogstm_co2) @@ -46,7 +50,11 @@ SUBROUTINE trcsms !! trcsed no updated for time step advancing #if defined key_trc_sed + call tstart("trcsed") + CALL trcsed ! tracers: sedimentation model + + call tstop("trcsed") # endif trcsmsparttime = MPI_WTIME() - trcsmsparttime ! cronometer-stop From 73eaf3fcd5e8af635e94b0d7b72ae8dc439fcc44 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 15 May 2024 12:02:13 +0200 Subject: [PATCH 19/33] trcopt GPU port, XXX: could not be tested --- src/BIO/OPT_mem.f90 | 33 ++++++++++++--------- src/BIO/trcopt.f90 | 71 +++++++++++++++++++-------------------------- src/BIO/trcsms.f90 | 6 ++-- 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/src/BIO/OPT_mem.f90 b/src/BIO/OPT_mem.f90 index 60af75dc..a08e2228 100644 --- a/src/BIO/OPT_mem.f90 +++ b/src/BIO/OPT_mem.f90 @@ -18,8 +18,9 @@ MODULE OPT_mem INTEGER, allocatable :: itabe(:),imaske(:,:) - double precision, allocatable :: zpar(:,:),xEPS_ogstm(:,:) - double precision, allocatable :: zpar0m(:),zpar100(:) + ! double precision, allocatable :: zpar(:,:) + double precision, allocatable :: xEPS_ogstm(:,:) + ! double precision, allocatable :: zpar0m(:),zpar100(:) double precision, allocatable :: kef(:,:) double precision, allocatable :: kextIO(:,:,:) real, allocatable :: zkef_f (:,:) @@ -40,17 +41,21 @@ subroutine myalloc_OPT() allocate(imaske(jpk,jpi)) imaske = huge(imaske(1,1)) !!!$omp parallel default (none) shared(jpk,jpi) - allocate(zpar(jpk,jpi)) - zpar = huge(zpar(1,1)) - allocate(xEPS_ogstm(jpk,jpi)) + ! allocate(zpar(jpk,jpi)) + ! zpar = huge(zpar(1,1)) + allocate(xEPS_ogstm(jpk,jpi)) + !$acc enter data create(xEPS_ogstm) + !$acc kernels default(present) xEPS_ogstm = huge(xEPS_ogstm(1,1)) - allocate(zpar0m(jpi)) - zpar0m = huge(zpar0m(1)) - allocate(zpar100(jpi)) - zpar100 = huge(zpar100(1)) + !$acc end kernels + ! allocate(zpar0m(jpi)) + ! zpar0m = huge(zpar0m(1)) + ! allocate(zpar100(jpi)) + ! zpar100 = huge(zpar100(1)) !!!$omp end parallel - allocate(kef(jpj,jpi)) + allocate(kef(jpj,jpi)) + !$acc enter data create(kef) kef = huge(kef(1,1)) allocate(kextIO(jpj,jpi,2)) kextIO = huge(kextIO(1,1,1)) @@ -71,10 +76,12 @@ subroutine clean_memory_opt deallocate(itabe) deallocate(imaske) - deallocate(zpar) + ! deallocate(zpar) + !$acc exit data delete(xEPS_ogstm) deallocate(xEPS_ogstm) - deallocate(zpar0m) - deallocate(zpar100) + ! deallocate(zpar0m) + ! deallocate(zpar100) + !$acc exit data delete(kef) deallocate(kef) deallocate(kextIO) diff --git a/src/BIO/trcopt.f90 b/src/BIO/trcopt.f90 index b0f21d82..3850c570 100644 --- a/src/BIO/trcopt.f90 +++ b/src/BIO/trcopt.f90 @@ -18,7 +18,7 @@ SUBROUTINE trcopt double precision :: conversion #if defined key_trc_nnpzddom || defined key_trc_npzd || key_trc_bfm - INTEGER :: jk,jj,ji + INTEGER :: jk,jj,ji,queue INTEGER :: mytid, ntids! omp variables trcoptparttime = MPI_WTIME() ! cronometer-start @@ -33,48 +33,37 @@ SUBROUTINE trcopt !!!$omp& shared(jj,jpk,jpj,jpi,xpar,conversion,kef,e3t,qsr) ! 1. determination of surface irradiance - DO ji = 1,jpi - DO jj = 1,jpj - - zpar0m(ji) = qsr(jj,ji)*conversion - zpar100(ji) = zpar0m(ji)*0.01 - xpar(1,jj,ji) = zpar0m(ji) - zpar(1,ji) = zpar0m(1) - xEPS_ogstm(1,ji) = kef(jj,ji) - - ! ENDDO - ! ENDDO -!! 2. determination of xpar -!! ------------------------ - ! DO ji = 1,jpi - ! DO jj = 1,jpj - - DO jk = 2,jpk - xEPS_ogstm(jk,ji) = max(kef(jj,ji),1.D-15) ! avoid denormalized number + queue=1 + !$acc kernels default(present) async(queue) + DO ji = 1,jpi + DO jj = 1,jpj + + ! zpar0m(ji) = qsr(jj,ji)*conversion + ! zpar100(ji) = zpar0m(ji)*0.01 + xpar(1,jj,ji) = qsr(jj,ji)*conversion + ! zpar(1,ji) = zpar0m(1) + xEPS_ogstm(1,ji) = kef(jj,ji) + + !! 2. determination of xpar + !! ------------------------ + + DO jk = 2,jpk + xEPS_ogstm(jk,ji) = max(kef(jj,ji),1.D-15) ! avoid denormalized number + END DO + + DO jk = 2,jpk + xpar(jk,jj,ji) = max( xpar(jk-1,jj,ji) *exp(-1. * xEPS_ogstm(jk-1,ji)* e3t(jk-1,jj,ji)) ,1.D-15) ! avoid denormalized number + + END DO + + DO jk = 1,jpk + xpar(jk,jj,ji) = max( xpar(jk,jj,ji) * exp(- xEPS_ogstm(jk,ji)* 0.5D+00* e3t(jk,jj,ji) ) ,1.D-15) + END DO + END DO - ! END DO - ! ENDDO - - ! DO ji = 1,jpi - ! DO jj = 1,jpj - DO jk = 2,jpk - !print * ,"CHECK",xpar(jk-1,jj,ji),xEPS_ogstm(jk-1,ji), - xpar(jk,jj,ji) = max( xpar(jk-1,jj,ji) *exp(-1. * xEPS_ogstm(jk-1,ji)* e3t(jk-1,jj,ji)) ,1.D-15) ! avoid denormalized number - - END DO - ! END DO - ! ENDDO - - ! DO ji = 1,jpi - ! DO jj = 1,jpj - DO jk = 1,jpk - xpar(jk,jj,ji) = max( xpar(jk,jj,ji) * exp(- xEPS_ogstm(jk,ji)* 0.5D+00* e3t(jk,jj,ji) ) ,1.D-15) - END DO - - END DO ENDDO - - + !$acc end kernels + !$acc wait(queue) trcoptparttime = MPI_WTIME() - trcoptparttime ! cronometer-stop trcopttottime = trcopttottime + trcoptparttime diff --git a/src/BIO/trcsms.f90 b/src/BIO/trcsms.f90 index 83b61914..39a2aa8f 100644 --- a/src/BIO/trcsms.f90 +++ b/src/BIO/trcsms.f90 @@ -26,6 +26,7 @@ SUBROUTINE trcsms ! XXX: to remove use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 + USE OPT_mem, only: kef IMPLICIT NONE @@ -38,12 +39,13 @@ SUBROUTINE trcsms call tstart("trcopt") + !$acc update device(kef,qsr) CALL trcopt ! tracers: optical model call tstop("trcopt") - + call tstart("trcbio") - !$acc update device(mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,xpar,e3t,gdept,ogstm_PH,ogstm_co2) + !$acc update device(mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,e3t,gdept,ogstm_PH,ogstm_co2) CALL trcbio ! tracers: biological model !$acc update host(tra,tra_DIA,tra_DIA_2d,ogstm_sediPI,ogstm_PH) call tstop("trcbio") From c0527c64a5e77e9eaa3c29850856a0b8c7fae945 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Wed, 15 May 2024 15:43:59 +0200 Subject: [PATCH 20/33] trcsed GPU port --- src/BIO/SED_mem.f90 | 26 +++-- src/BIO/trcopt.f90 | 2 + src/BIO/trcsed.f90 | 237 ++++++++++++++++++++++++++----------------- src/BIO/trcsms.f90 | 7 -- src/General/step.f90 | 13 +++ 5 files changed, 178 insertions(+), 107 deletions(-) diff --git a/src/BIO/SED_mem.f90 b/src/BIO/SED_mem.f90 index 0062dce6..ffdf2b79 100644 --- a/src/BIO/SED_mem.f90 +++ b/src/BIO/SED_mem.f90 @@ -46,7 +46,7 @@ subroutine myalloc_SED() #endif dimen_jvsed=0 - allocate(sed_idx(nsed)) + allocate(sed_idx(nsed)) sed_idx = huge(sed_idx(1)) sed_idx(1) = ppR6c @@ -78,11 +78,8 @@ subroutine myalloc_SED() allocate(jarr_sed(2, jpi*jpj)) jarr_sed = huge(jarr_sed(1,1)) allocate(jarr_sed_flx(jpk,jpi*jpj)) - jarr_sed_flx = huge(jarr_sed_flx(1,1)) - allocate( ztra(nsed,ntids)) - ztra = huge(ztra(1,1)) - allocate(zwork(jpk,nsed, ntids)) - zwork = huge(zwork(1,1,1)) + jarr_sed_flx = huge(jarr_sed_flx(1,1)) + !$acc enter data create(sed_idx,jarr_sed,jarr_sed_flx) #ifdef Mem_Monitor @@ -90,11 +87,22 @@ subroutine myalloc_SED() #endif END subroutine myalloc_SED - - - + + subroutine myalloc_SED_ztra_zwork() + + allocate(ztra(nsed,dimen_jvsed)) + allocate(zwork(jpk,nsed,dimen_jvsed)) + !$acc enter data create(ztra,zwork) + !$acc kernels default(present) + ztra = huge(ztra(1,1)) + zwork = huge(zwork(1,1,1)) + !$acc end kernels + + end subroutine myalloc_SED_ztra_zwork + subroutine clean_memory_sed + !$acc exit data delete(ztra,zwork,sed_idx,jarr_sed,jarr_sed_flx) deallocate(sed_idx) deallocate(jarr_sed) deallocate(jarr_sed_flx) diff --git a/src/BIO/trcopt.f90 b/src/BIO/trcopt.f90 index 3850c570..e76308f6 100644 --- a/src/BIO/trcopt.f90 +++ b/src/BIO/trcopt.f90 @@ -34,6 +34,8 @@ SUBROUTINE trcopt ! 1. determination of surface irradiance queue=1 + ! XXX: this can be improved but no way to test as it doesn't seem to + ! impact numerical results !$acc kernels default(present) async(queue) DO ji = 1,jpi DO jj = 1,jpj diff --git a/src/BIO/trcsed.f90 b/src/BIO/trcsed.f90 index b9c5c484..f5e22aff 100644 --- a/src/BIO/trcsed.f90 +++ b/src/BIO/trcsed.f90 @@ -61,8 +61,8 @@ SUBROUTINE trcsed #ifdef key_trc_bfm LOGICAL :: l1,l2,l3 - INTEGER :: ji,jj,jk,jv,jf,js - INTEGER :: bottom + INTEGER :: ji,jj,jk,jv,jf,js,ntx + INTEGER :: bottom,queue double precision :: ze3tr,d2s ! omp variables @@ -71,7 +71,7 @@ SUBROUTINE trcsed !! =================== - + queue=1 d2s=1./3600./24. ! speed from (m/day) to (m/s) @@ -100,107 +100,162 @@ SUBROUTINE trcsed END DO END DO END DO - + call myalloc_SED_ztra_zwork() + !$acc update device(jarr_sed,jarr_sed_flx) ENDIF ! End initialization phase (once at the beginning) ! vertical slab ! ============= + ! if( mytid + jv <= dimen_jvsed) then + ! 1. sedimentation of detritus : upstream scheme + ! ----------------------------------------------- + ! 1.1 initialisation needed for bottom and surface value + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js = 1, nsed + DO jk = 1,jpk + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = 0. + + END DO + END DO + END DO + + ! 1.2 tracer flux at w-point: we use -vsed (downward flux) + ! with simplification : no e1*e2 + + ! Particulate + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js =1,4 + DO jk = 2,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = -vsed * trn(jk-1,jj,ji, sed_idx(js)) + END DO + END DO + END DO + + ! Diatoms + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js =5,9 + DO jk = 2,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = -ogstm_sedipi(jk-1,jj,ji,1) * trn(jk-1,jj,ji, sed_idx(js)) + END DO + END DO + END DO + + ! Flagellates + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js =10,13 + DO jk = 2,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = -ogstm_sedipi(jk-1,jj,ji,2) * trn(jk-1,jj,ji, sed_idx(js)) + END DO + END DO + END DO + + ! Picophytoplankton + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js =14,17 + DO jk = 2,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = -ogstm_sedipi(jk-1,jj,ji,3) * trn(jk-1,jj,ji, sed_idx(js)) + END DO + END DO + END DO + + ! Dinoflagellates + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js =18,21 + DO jk = 2,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = -ogstm_sedipi(jk-1,jj,ji,4) * trn(jk-1,jj,ji, sed_idx(js)) + END DO + END DO + END DO + + ! Calcite + !$acc parallel loop gang vector collapse(3) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js =22,22 + DO jk = 2,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + zwork(jk,js,ntx) = - vsedO5c * trn(jk-1,jj,ji, sed_idx(js)) + END DO + END DO + END DO + + !$acc parallel loop gang vector collapse(2) default(present) async(queue) + DO jv=1,dimen_jvsed + DO js = 1,nsed + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + bottom = mbathy(jj,ji) + 1 + zwork(bottom,js,ntx) = bottom_flux * zwork(bottom,js,ntx) ! bottom_flux = 0 -> no flux in the sea floor + END DO + END DO + + ! 1.3 tracer flux divergence at t-point added to the general trend + + !$acc parallel loop gang vector collapse(2) default(present) async(queue) + DO jv=1,dimen_jvsed + DO jk = 1,jpkm1 + ntx=jv + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + + jf= jarr_sed_flx(jk,jV) + + ze3tr = 1./e3t(jk,jj,ji) + + DO js =1,nsed + ztra(js,ntx) = -ze3tr * (zwork(jk,js,ntx) - zwork(jk+1,js,ntx)) + IF ((Fsize .GT. 0) .AND. (jf .GT. 0)) THEN + diaflx(4,jf,sed_idx(js)) = diaflx(4, jf,sed_idx(js)) + zwork(jk,js,ntx)*rdt + ENDIF - MAIN_LOOP: DO jv=1,dimen_jvsed - -! if( mytid + jv <= dimen_jvsed) then -! 1. sedimentation of detritus : upstream scheme -! ----------------------------------------------- -! 1.1 initialisation needed for bottom and surface value - - ji = jarr_sed(2,jv) - jj = jarr_sed(1,jv) - - - DO js = 1, nsed - DO jk = 1,jpk - - zwork(jk,js,1) = 0. - - END DO - END DO - -! 1.2 tracer flux at w-point: we use -vsed (downward flux) -! with simplification : no e1*e2 - - - -! Particulate - DO js =1,4 - DO jk = 2,jpkm1 - zwork(jk,js,1) = -vsed * trn(jk-1,jj,ji, sed_idx(js)) - END DO - END DO -! Diatoms - DO js =5,9 - DO jk = 2,jpkm1 - zwork(jk,js,1) = -ogstm_sedipi(jk-1,jj,ji,1) * trn(jk-1,jj,ji, sed_idx(js)) - END DO - END DO -! Flagellates - DO js =10,13 - DO jk = 2,jpkm1 - zwork(jk,js,1) = -ogstm_sedipi(jk-1,jj,ji,2) * trn(jk-1,jj,ji, sed_idx(js)) - END DO - END DO -! Picophytoplankton - DO js =14,17 - DO jk = 2,jpkm1 - zwork(jk,js,1) = -ogstm_sedipi(jk-1,jj,ji,3) * trn(jk-1,jj,ji, sed_idx(js)) - END DO - END DO -! Dinoflagellates - DO js =18,21 - DO jk = 2,jpkm1 - zwork(jk,js,1) = -ogstm_sedipi(jk-1,jj,ji,4) * trn(jk-1,jj,ji, sed_idx(js)) - END DO - END DO - -! Calcite - DO js =22,22 - DO jk = 2,jpkm1 - zwork(jk,js,1) = - vsedO5c * trn(jk-1,jj,ji, sed_idx(js)) - END DO - END DO - bottom = mbathy(jj,ji) + 1 - zwork(bottom,:,1) = bottom_flux * zwork(bottom,:,1) ! bottom_flux = 0 -> no flux in the sea floor - -! 1.3 tracer flux divergence at t-point added to the general trend - - DO jk = 1,jpkm1 - jf= jarr_sed_flx(jk,jV) - - ze3tr = 1./e3t(jk,jj,ji) - - DO js =1,nsed - ztra(js,1) = -ze3tr * (zwork(jk,js,1) - zwork(jk+1,js,1)) - IF ((Fsize .GT. 0) .AND. (jf .GT. 0)) THEN - diaflx(4,jf,sed_idx(js)) = diaflx(4, jf,sed_idx(js)) + zwork(jk,js,1)*rdt - ENDIF - END DO - - DO js =1,nsed !!! d2s convert speed from (m/day) to (m/s) - tra(jk,jj,ji,sed_idx(js)) = tra(jk,jj,ji,sed_idx(js)) + ztra(js,1)*d2s - END DO + tra(jk,jj,ji,sed_idx(js)) = tra(jk,jj,ji,sed_idx(js)) + ztra(js,ntx)*d2s + END DO + END DO + END DO + !$acc end parallel loop + !$acc wait(queue) #ifdef key_trc_diabio - trbio(jk,jj,ji,8) = ztra -#endif - - END DO + DO jv=1,dimen_jvsed + ji = jarr_sed(2,jv) + jj = jarr_sed(1,jv) + DO jk = 1,jpkm1 + trbio(jk,jj,ji,8) = ztra + END DO + END DO #endif - - END DO MAIN_LOOP +#endif ! key_trc_bfm !!!$omp end parallel do diff --git a/src/BIO/trcsms.f90 b/src/BIO/trcsms.f90 index 39a2aa8f..d28510b0 100644 --- a/src/BIO/trcsms.f90 +++ b/src/BIO/trcsms.f90 @@ -24,9 +24,6 @@ SUBROUTINE trcsms USE mpi use simple_timer - ! XXX: to remove - use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 - USE OPT_mem, only: kef IMPLICIT NONE @@ -34,20 +31,16 @@ SUBROUTINE trcsms trcsmsparttime = MPI_WTIME() ! cronometer-start - !! this first routines are parallelized on vertical slab call tstart("trcopt") - !$acc update device(kef,qsr) CALL trcopt ! tracers: optical model call tstop("trcopt") call tstart("trcbio") - !$acc update device(mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,e3t,gdept,ogstm_PH,ogstm_co2) CALL trcbio ! tracers: biological model - !$acc update host(tra,tra_DIA,tra_DIA_2d,ogstm_sediPI,ogstm_PH) call tstop("trcbio") !! trcsed no updated for time step advancing diff --git a/src/General/step.f90 b/src/General/step.f90 index 79d7b1fa..3c2f9b0e 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -293,6 +293,9 @@ SUBROUTINE trcstp ! XXX: to be removed use DIA_mem, only: diaflx,flx_ridxt use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,umask,vmask,tmask,avt,ahtt + use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 + USE OPT_mem, only: kef + USE SED_mem use simple_timer IMPLICIT NONE @@ -337,7 +340,17 @@ SUBROUTINE trcstp call tstop("trcsbc") call tstart("trcsms") + + !$acc update device(kef,qsr,mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,e3t,gdept,ogstm_PH,ogstm_co2) if(lbfm) +#if defined key_trc_sed + !$acc update device(sed_idx,diaflx,e3t,tra,ogstm_sedipi,mbathy) if(lbfm) +#endif IF (lbfm) CALL trcsms + !$acc update host(tra,tra_DIA,tra_DIA_2d,ogstm_sediPI,ogstm_PH) if(lbfm) +#if defined key_trc_sed + !$acc update host(diaflx,zwork) if(lbfm) +#endif + call tstop("trcsms") call tstart("trczdf") From 0f6df5b4e78f1310f7bdbd7e64809b4ed6f6bf30 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 07:48:37 +0200 Subject: [PATCH 21/33] snutel and trcnxt GPU port --- src/BIO/FN_mem.f90 | 8 +++++++- src/General/step.f90 | 4 ++++ src/PHYS/snutel.f90 | 20 +++++++++++++------- src/PHYS/trcnxt.f90 | 29 +++++++++++++++-------------- 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/BIO/FN_mem.f90 b/src/BIO/FN_mem.f90 index dafa86ec..57b3f166 100644 --- a/src/BIO/FN_mem.f90 +++ b/src/BIO/FN_mem.f90 @@ -47,8 +47,11 @@ subroutine myalloc_FN() allocate(jarr_snu(2, jpi*jpj)) jarr_snu = huge(jarr_snu(1,1)) - allocate(tra_FN(jpk,jpj,jpi,jptra)) + allocate(tra_FN(jpk,jpj,jpi,jptra)) + !$acc enter data create(tra_FN) + !$acc kernels default(present) tra_FN = huge(tra_FN(1,1,1,1)) + !$acc end kernels !CALL OPA_elements(elements,nelements,idx_element) @@ -66,7 +69,9 @@ subroutine myalloc_FN() FN_ranking = huge(FN_ranking(1)) + !$acc kernels default(present) tra_FN=0. + !$acc end kernels ! cor_FN=0. FN_ranking=0. @@ -81,6 +86,7 @@ END subroutine myalloc_FN subroutine clean_memory_fn + !$acc exit data delete(tra_FN) deallocate(jarr_snu) deallocate(tra_FN) deallocate(TOTcalc) diff --git a/src/General/step.f90 b/src/General/step.f90 index 3c2f9b0e..bc2bd2ef 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -362,7 +362,9 @@ SUBROUTINE trcstp call tstop("trczdf") call tstart("snutel") + !$acc update device(tmask,tra) if(lsnu) IF (lsnu) CALL snutel + !$acc update host(tra) if(lsnu) call tstop("snutel") call boundaries%apply_dirichlet() @@ -370,7 +372,9 @@ SUBROUTINE trcstp ! CALL checkValues call tstart("trcnxt") + !$acc update device(tra,tmask) CALL trcnxt ! tracers: fields at next time step + !$acc update host(trb,trn,tra) call tstop("trcnxt") trcstpparttime = MPI_WTIME() - trcstpparttime ! cronometer-stop diff --git a/src/PHYS/snutel.f90 b/src/PHYS/snutel.f90 index d5adbb08..b30cd36e 100644 --- a/src/PHYS/snutel.f90 +++ b/src/PHYS/snutel.f90 @@ -29,13 +29,17 @@ SUBROUTINE snutel() ! omp variables INTEGER :: jk,jj,ji,jn,jv! ,jnn,gji,gjj + INTEGER :: queue double precision :: zfact,zdt !!---------------------------------------------------------------------- !! statement functions !! =================== + queue=1 + !$acc kernels default(present) async(queue) TRA_FN = 0.0 + !$acc end kernels ! SMALL = 0.00000000001 !**************** INIT PHASE ********************* @@ -63,27 +67,29 @@ SUBROUTINE snutel() !!!$omp& shared(jn,jpk,jpj,jpi,tra,tmask,tra_FN,SMALL) - DO ji = 1,jpi - DO jj = 1,jpj - DO jk = 1,jpk - - if (tmask(jk,jj,ji).ne.0.0) then + !$acc parallel loop collapse(3) default(present) async(queue) + DO ji = 1,jpi + DO jj = 1,jpj + DO jk = 1,jpk + if (tmask(jk,jj,ji).ne.0.0) then if( tra(jk,jj,ji,jn) .GT. 0. ) then else tra_FN(jk,jj,ji,jn) = - tra(jk,jj,ji,jn) + SMALL tra( jk,jj,ji,jn) = SMALL end if - endif - END DO + endif END DO END DO + END DO !!!$omp end parallel END DO TRACER_LOOP + !$acc wait(queue) + !! Frequency of correction if plus module of kt diff --git a/src/PHYS/trcnxt.f90 b/src/PHYS/trcnxt.f90 index 8c5ee479..760875d1 100644 --- a/src/PHYS/trcnxt.f90 +++ b/src/PHYS/trcnxt.f90 @@ -68,13 +68,13 @@ SUBROUTINE trcnxt ! ... Mpp : export boundary values to neighboring processors - CALL mpplnk_my(tra(1,1,1,jn)) + CALL mpplnk_my(tra(1,1,1,jn), gpu=.true.) # else ! ... T-point, 3D array, full array tra(1,1,1,jn) is initialised - CALL lbc( tra(1,1,1,jn), 1, 1, 1, 1, jpk, 1 ) + CALL lbc( tra(1,1,1,jn), 1, 1, 1, 1, jpk, 1, use_gpu=.true.) #endif @@ -84,18 +84,19 @@ SUBROUTINE trcnxt - DO ji = 1,jpi - DO jj = 1,jpj - DO jk = 1,jpk - - !tra(jk,jj,ji,jn ) = tra(jk,jj,ji,jn )*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) - trb(jk,jj,ji,jn ) = tra(jk,jj,ji,jn ) - trn(jk,jj,ji,jn ) = tra(jk,jj,ji,jn )*tmask(jk,jj,ji) - tra(jk,jj,ji,jn ) = 0.e0 - ! print *,jk,jj,ji,trb(jk,jj,ji,jn ),trn(jk,jj,ji,jn ),e3t_back(jk,jj,ji),e3t(jk,jj,ji) + !$acc parallel loop collapse(3) default(present) + DO ji = 1,jpi + DO jj = 1,jpj + DO jk = 1,jpk + + !tra(jk,jj,ji,jn ) = tra(jk,jj,ji,jn )*e3t_back(jk,jj,ji)/e3t(jk,jj,ji) + trb(jk,jj,ji,jn ) = tra(jk,jj,ji,jn ) + trn(jk,jj,ji,jn ) = tra(jk,jj,ji,jn )*tmask(jk,jj,ji) + tra(jk,jj,ji,jn ) = 0.e0 + ! print *,jk,jj,ji,trb(jk,jj,ji,jn ),trn(jk,jj,ji,jn ),e3t_back(jk,jj,ji),e3t(jk,jj,ji) + END DO END DO - END DO - END DO + END DO @@ -107,7 +108,7 @@ SUBROUTINE trcnxt END DO TRACER_LOOP - + trcnxtparttime = MPI_WTIME() - trcnxtparttime ! cronometer-stop trcnxttottime = trcnxttottime + trcnxtparttime From 63616e2b6e1624d5d84c89d6e30bbf94f56d9863 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 08:24:48 +0200 Subject: [PATCH 22/33] trcadv: allocate once --- src/PHYS/trcadv.f90 | 49 ++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 156e1381..70b703fc 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -81,9 +81,9 @@ SUBROUTINE trcadv double precision :: timer double precision,dimension(:), allocatable :: array double precision,dimension(:,:), allocatable :: surface - double precision, allocatable,dimension(:,:,:) :: zti,ztj - double precision, allocatable,dimension(:,:,:) :: zx,zy,zz,zbuf - double precision, allocatable,dimension(:,:,:) :: zkx,zky,zkz + double precision, save,allocatable,dimension(:,:,:) :: zti,ztj + double precision, save,allocatable,dimension(:,:,:) :: zx,zy,zz,zbuf + double precision, save,allocatable,dimension(:,:,:) :: zkx,zky,zkz logical :: use_gpu queue=1 @@ -171,29 +171,29 @@ SUBROUTINE trcadv write(*,*) 'trcadv: RANK -> ', myrank, ' good_points -> ', goodpoints + call tstart("trcadv_alloc") + allocate(zy(jpk,jpj,jpi)) + allocate(zx(jpk,jpj,jpi)) + allocate(zz(jpk,jpj,jpi)) + allocate(ztj(jpk,jpj,jpi)) + allocate(zti(jpk,jpj,jpi)) + allocate(zkx(jpk,jpj,jpi)) + allocate(zky(jpk,jpj,jpi)) + allocate(zkz(jpk,jpj,jpi)) + allocate(zbuf(jpk,jpj,jpi)) + + !$acc enter data create(zy,zx,zz,ztj,zti,zkx,zky,zkz,zbuf) + + call tstop("trcadv_alloc") adv_initialized=.true. endif call tstop("trcadv_init") - call tstart("trcadv_alloc") !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it - allocate(zy(jpk,jpj,jpi)) - allocate(zx(jpk,jpj,jpi)) - allocate(zz(jpk,jpj,jpi)) - allocate(ztj(jpk,jpj,jpi)) - allocate(zti(jpk,jpj,jpi)) - allocate(zkx(jpk,jpj,jpi)) - allocate(zky(jpk,jpj,jpi)) - allocate(zkz(jpk,jpj,jpi)) - allocate(zbuf(jpk,jpj,jpi)) - - !$acc enter data create(zy,zx,zz,ztj,zti,zkx,zky,zkz,zbuf) - - call tstop("trcadv_alloc") !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! end initialization phase @@ -953,22 +953,7 @@ SUBROUTINE trcadv !$acc update host( diaflx(1:7, 1:Fsize, 1:jptra) ) !$acc update host( tra(1:jpk,1:jpj,1:jpi,1:jptra) ) - !$acc update host( zy(1:jpk,1:jpj,1:jpi), zx(1:jpk,1:jpj,1:jpi), zz(1:jpk,1:jpj,1:jpi) ) - !$acc update host( ztj(1:jpk,1:jpj,1:jpi), zti(1:jpk,1:jpj,1:jpi) ) - !$acc update host( zkx(1:jpk,1:jpj,1:jpi), zky(1:jpk,1:jpj,1:jpi), zkz(1:jpk,1:jpj,1:jpi) ) - !$acc update host( zbuf(1:jpk,1:jpj,1:jpi) ) - - !$acc exit data delete( zy, zx, zz, ztj, zti, zkx, zky, zkz, zbuf ) finalize !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it - deallocate(zy ) - deallocate(zx ) - deallocate(zz ) - deallocate(ztj ) - deallocate(zti ) - deallocate(zkx ) - deallocate(zky ) - deallocate(zkz ) - deallocate(zbuf ) trcadvparttime = MPI_WTIME() - trcadvparttime trcadvtottime = trcadvtottime + trcadvparttime From e8a67dc1f8bda2ecdb8b2e131e96e6fcaa44cdcc Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 11:24:10 +0200 Subject: [PATCH 23/33] trcsbc GPU port --- src/General/memory.f90 | 4 +++- src/General/step.f90 | 2 ++ src/PHYS/trcsbc.f90 | 19 ++++++++++--------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/General/memory.f90 b/src/General/memory.f90 index 15a6a3cf..29a9bf40 100644 --- a/src/General/memory.f90 +++ b/src/General/memory.f90 @@ -602,7 +602,8 @@ subroutine alloc_tot() sn = huge(sn(1,1,1)) allocate(rdn(jpk,jpj,jpi)) rdn = huge(rdn(1,1,1)) - allocate(rhopn(jpk,jpj,jpi)) + allocate(rhopn(jpk,jpj,jpi)) + !$acc enter data create(rhopn) rhopn = huge(rhopn(1,1,1)) allocate(rho(jpk,jpj,jpi)) rho = huge(rho(1,1,1)) @@ -910,6 +911,7 @@ subroutine clean_memory() deallocate(sn) deallocate(rdn) deallocate(rhopn) + !$acc exit data delete(rhopn) deallocate(rho) !$acc exit data delete(rho) diff --git a/src/General/step.f90 b/src/General/step.f90 index bc2bd2ef..c6eb83e6 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -336,7 +336,9 @@ SUBROUTINE trcstp ! tracers: sink and source (must be parallelized on vertical slab) call tstart("trcsbc") + !$acc update device(e3t,rhopn,tmask,emp,trn,tra) if (lsbc) IF (lsbc) CALL trcsbc ! surface cell processes, default lsbc = False + !$acc update host(tra) if (lsbc) call tstop("trcsbc") call tstart("trcsms") diff --git a/src/PHYS/trcsbc.f90 b/src/PHYS/trcsbc.f90 index 752dbe2f..29aa0a12 100644 --- a/src/PHYS/trcsbc.f90 +++ b/src/PHYS/trcsbc.f90 @@ -18,19 +18,20 @@ SUBROUTINE trcsbc ! Conc/dilution process - DO jn=1,jptra - DO ji = 1, jpi - DO jj = 1, jpj + !$acc parallel loop collapse(3) default(present) + DO jn=1,jptra + DO ji = 1, jpi + DO jj = 1, jpj zse3t = 1. / e3t(1,jj,ji) - ztra = 1./ rhopn(1,jj,ji) * zse3t * tmask(1,jj,ji) * emp(jj,ji) * trn(1,jj,ji,jn) ! original emps(jj,ji) - tra(1,jj,ji,jn) = tra(1,jj,ji,jn) + ztra + ztra = 1./ rhopn(1,jj,ji) * zse3t * tmask(1,jj,ji) * emp(jj,ji) * trn(1,jj,ji,jn) ! original emps(jj,ji) + tra(1,jj,ji,jn) = tra(1,jj,ji,jn) + ztra - END DO - END DO + END DO + END DO ENDDO - trcsbcparttime = MPI_WTIME() - trcsbcparttime - trcsbctottime = trcsbctottime + trcsbcparttime + trcsbcparttime = MPI_WTIME() - trcsbcparttime + trcsbctottime = trcsbctottime + trcsbcparttime END SUBROUTINE trcsbc From 50bafc6060619b48bb83950cca0b1f497ca118c3 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 16:03:09 +0200 Subject: [PATCH 24/33] trcdmp GPU port --- src/IO/BC_mem.f90 | 15 ++++++++++----- src/PHYS/trcdmp.f90 | 17 +++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/IO/BC_mem.f90 b/src/IO/BC_mem.f90 index 0cae6af8..1668b052 100644 --- a/src/IO/BC_mem.f90 +++ b/src/IO/BC_mem.f90 @@ -166,15 +166,17 @@ SUBROUTINE alloc_DTATRC ! ENDIF IF ((lat .NE. 0) .AND. (lon .NE. 0)) THEN - allocate(tra_matrix_atm(jn_atm)) - tra_matrix_atm = huge(tra_matrix_atm(1)) - allocate(atm_aux(jpj,jpi)) + allocate(tra_matrix_atm(jn_atm)) + !$acc enter data create(tra_matrix_atm) + tra_matrix_atm = huge(tra_matrix_atm(1)) + allocate(atm_aux(jpj,jpi)) atm_aux = huge(atm_aux(1,1)) allocate(atm_idxtglo( jpj,jpi)) atm_idxtglo = huge(atm_idxtglo(1,1)) tra_matrix_atm(1) = ppN1p ! phosphates tra_matrix_atm(2) = ppN3n ! nitrates + !$acc update device(tra_matrix_atm) ENDIF @@ -270,8 +272,9 @@ SUBROUTINE alloc_DTATRC_local_atm() allocate(atm_dtatrc(jpj,jpi, 2, jn_atm)) atm_dtatrc = huge(atm_dtatrc(1,1,1,1)) - allocate(atm (jpj,jpi, jn_atm)) - atm = huge(atm(1,1,1)) + allocate(atm (jpj,jpi, jn_atm)) + !$acc enter data create(atm) + atm = huge(atm(1,1,1)) #ifdef Mem_Monitor @@ -287,12 +290,14 @@ subroutine clean_memory_bc() deallocate(resto) if ((lat /= 0) .and. (lon /= 0)) then + !$acc exit data delete(tra_matrix_atm) deallocate(tra_matrix_atm) deallocate(atm_aux) deallocate(atm_idxtglo) endif deallocate(atm_dtatrc) + !$acc exit data delete(atm) deallocate(atm) end subroutine clean_memory_bc diff --git a/src/PHYS/trcdmp.f90 b/src/PHYS/trcdmp.f90 index c6e5d58a..8fc8d332 100644 --- a/src/PHYS/trcdmp.f90 +++ b/src/PHYS/trcdmp.f90 @@ -48,14 +48,15 @@ SUBROUTINE trcdmp IF ( latmosph ) THEN - DO jn=1, jn_atm - tra_idx=tra_matrix_atm(jn) - DO ji=1,jpi - DO jj=1,jpj - tra(1,jj,ji,tra_idx) = tra(1,jj,ji,tra_idx) + atm(jj,ji,jn)/e3t(1,jj,ji) - ENDDO - ENDDO - ENDDO + !$acc parallel loop collapse(3) default(present) + DO jn=1, jn_atm + DO ji=1,jpi + DO jj=1,jpj + tra_idx=tra_matrix_atm(jn) + tra(1,jj,ji,tra_idx) = tra(1,jj,ji,tra_idx) + atm(jj,ji,jn)/e3t(1,jj,ji) + ENDDO + ENDDO + ENDDO ENDIF From f092b53571a346a5d8e563c56ccac8b6b2331d47 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 16:03:31 +0200 Subject: [PATCH 25/33] simplify trcadv GPU transfers and put them in step.f90 --- src/General/step.f90 | 9 ++++++++- src/PHYS/trcadv.f90 | 26 +------------------------- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/src/General/step.f90 b/src/General/step.f90 index c6eb83e6..2089e5a3 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -296,6 +296,7 @@ SUBROUTINE trcstp use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 USE OPT_mem, only: kef USE SED_mem + USE ADV_mem use simple_timer IMPLICIT NONE @@ -303,12 +304,18 @@ SUBROUTINE trcstp trcstpparttime = MPI_WTIME() ! cronometer-start call tstart("trcadv") - IF (ladv) CALL trcadv ! tracers: advection + + !$acc update device(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,e1t,e2t,e3t,e1u,e2u,e3u,e1v,e2v,e3v,e3w,un,vn,wn,tra,trn,advmask,flx_ridxt,diaflx) + IF (ladv) CALL trcadv ! tracers advection + !$acc update host(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,diaflx,tra) + call tstop("trcadv") # if defined key_trc_dmp call tstart("trcdmp") + !$acc update device(tra,atm,e3t) if(latmosph) CALL trcdmp ! tracers: damping for passive tracerstrcstp + !$acc update host(tra) if(latmosph) call tstop("trcdmp") ! ---------------------------------------------------------------------- diff --git a/src/PHYS/trcadv.f90 b/src/PHYS/trcadv.f90 index 70b703fc..dc4b4cb8 100644 --- a/src/PHYS/trcadv.f90 +++ b/src/PHYS/trcadv.f90 @@ -184,6 +184,7 @@ SUBROUTINE trcadv allocate(zbuf(jpk,jpj,jpi)) !$acc enter data create(zy,zx,zz,ztj,zti,zkx,zky,zkz,zbuf) + !$acc update device(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,advmask) call tstop("trcadv_alloc") @@ -202,23 +203,6 @@ SUBROUTINE trcadv zdt = rdt*ndttrc !$OMP TASK private(ji,jj) firstprivate(jpim1,jpjm1) shared(zbtr_arr,e1t,e2t,e3t) default(none) - !$acc update device( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) - !$acc update device( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) - !$acc update device( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) - !$acc update device( zbtr_arr(1:jpk,1:jpj,1:jpi) ) - - !$acc update device( e1t(1:jpj,1:jpi), e2t(1:jpj,1:jpi), e3t(1:jpk,1:jpj,1:jpi) ) - !$acc update device( e1u(1:jpj,1:jpi), e2u(1:jpj,1:jpi), e3u(1:jpk,1:jpj,1:jpi) ) - !$acc update device( e1v(1:jpj,1:jpi), e2v(1:jpj,1:jpi), e3v(1:jpk,1:jpj,1:jpi) ) - !$acc update device( e3w(1:jpk,1:jpj,1:jpi) ) - !$acc update device( un(1:jpk,1:jpj,1:jpi), vn(1:jpk,1:jpj,1:jpi), wn(1:jpk,1:jpj,1:jpi) ) - - !$acc update device(tra(1:jpk,1:jpj,1:jpi,1:jptra)) - !$acc update device(trn(1:jpk,1:jpj,1:jpi,1:jptra)) - !$acc update device(advmask(1:jpk,1:jpj,1:jpi)) - !$acc update device(flx_ridxt(1:Fsize,1:4)) - !$acc update device( diaflx(1:7, 1:Fsize, 1:jptra)) - call tstart("trcadv_1") !$acc kernels default(present) async(queue) @@ -945,14 +929,6 @@ SUBROUTINE trcadv !$OMP end taskloop - !$acc update host( zaa(1:jpk,1:jpj,1:jpi), zbb(1:jpk,1:jpj,1:jpi), zcc(1:jpk,1:jpj,1:jpi) ) - !$acc update host( inv_eu(1:jpk,1:jpj,1:jpi), inv_ev(1:jpk,1:jpj,1:jpi), inv_et(1:jpk,1:jpj,1:jpi) ) - !$acc update host( big_fact_zaa (1:jpk,1:jpj,1:jpi), big_fact_zbb(1:jpk,1:jpj,1:jpi), big_fact_zcc(1:jpk,1:jpj,1:jpi) ) - !$acc update host( zbtr_arr(1:jpk,1:jpj,1:jpi) ) - - !$acc update host( diaflx(1:7, 1:Fsize, 1:jptra) ) - !$acc update host( tra(1:jpk,1:jpj,1:jpi,1:jptra) ) - !!OpenMP compatibility broken. Possibility to use ifndef OpenMP + rename the file in trcadv.F90 to keep it trcadvparttime = MPI_WTIME() - trcadvparttime From 6d8d3a4e16a546e49cf0d4f31f02028a69ae7175 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 16:45:34 +0200 Subject: [PATCH 26/33] move trcadv transfers --- src/General/step.f90 | 58 +++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/src/General/step.f90 b/src/General/step.f90 index 2089e5a3..6a9b221b 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -57,6 +57,14 @@ SUBROUTINE step ! trcstp, trcdia passive tracers interface + ! XXX: to be removed + use DIA_mem, only: diaflx,flx_ridxt + use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,umask,vmask,tmask,avt,ahtt + use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 + USE OPT_mem, only: kef + USE SED_mem + USE ADV_mem + use simple_timer IMPLICIT NONE @@ -190,7 +198,25 @@ SUBROUTINE step ! Call Passive tracer model between synchronization for small parallelisation call tstart("trcstp_all") + !$acc update device(tra,tmask) + !$acc update device(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,e1t,e2t,e3t,e1u,e2u,e3u,e1v,e2v,e3v,e3w,un,vn,wn,trn,advmask,flx_ridxt,diaflx) if(ladv) + !$acc update device(atm,e3t) if(latmosph) + !$acc update device(umask,vmask,trb,ahtt,diaflx,flx_ridxt) if(lhdf) + !$acc update device(e3t,rhopn,emp,trn) if (lsbc) + !$acc update device(kef,qsr,mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,e3t,gdept,ogstm_PH,ogstm_co2) if(lbfm) +#if defined key_trc_sed + !$acc update device(sed_idx,diaflx,e3t,ogstm_sedipi,mbathy) if(lbfm) +#endif + !$acc update device(e1t,diaflx,e3t_back,e2t,trb,e3t,avt,e3w) if (lzdf) CALL trcstp ! se commento questo non fa calcoli + !$acc update host(trb,trn,tra) + !$acc update host(diaflx) if(lhdf) + !$acc update host(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,diaflx) if(ladv) + !$acc update host(tra_DIA,tra_DIA_2d,ogstm_sediPI,ogstm_PH) if(lbfm) +#if defined key_trc_sed + !$acc update host(diaflx,zwork) if(lbfm) +#endif + !$acc update host(diaflx) if (lzdf) call tstop("trcstp_all") call tstart("trcave") call trcave @@ -290,14 +316,6 @@ SUBROUTINE trcstp ! with surface boundary condition ! with IMPLICIT vertical diffusion - ! XXX: to be removed - use DIA_mem, only: diaflx,flx_ridxt - use myalloc, only: tra,trb,e1t,e3t_back,e2t,e3t,e3w,umask,vmask,tmask,avt,ahtt - use BIO_mem, only: ogstm_sediPI,ogstm_PH,ogstm_co2 - USE OPT_mem, only: kef - USE SED_mem - USE ADV_mem - use simple_timer IMPLICIT NONE integer jn,jk,ji,jj @@ -305,17 +323,13 @@ SUBROUTINE trcstp call tstart("trcadv") - !$acc update device(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,e1t,e2t,e3t,e1u,e2u,e3u,e1v,e2v,e3v,e3w,un,vn,wn,tra,trn,advmask,flx_ridxt,diaflx) IF (ladv) CALL trcadv ! tracers advection - !$acc update host(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,diaflx,tra) call tstop("trcadv") # if defined key_trc_dmp call tstart("trcdmp") - !$acc update device(tra,atm,e3t) if(latmosph) CALL trcdmp ! tracers: damping for passive tracerstrcstp - !$acc update host(tra) if(latmosph) call tstop("trcdmp") ! ---------------------------------------------------------------------- @@ -336,44 +350,28 @@ SUBROUTINE trcstp ! ----------------------------- call tstart("trchdf") - !$acc update device(umask,vmask,tmask,trb,ahtt,tra,diaflx,flx_ridxt) if(lhdf) IF (lhdf) CALL trchdf - !$acc update host(diaflx,tra) if(lhdf) call tstop("trchdf") ! tracers: sink and source (must be parallelized on vertical slab) call tstart("trcsbc") - !$acc update device(e3t,rhopn,tmask,emp,trn,tra) if (lsbc) IF (lsbc) CALL trcsbc ! surface cell processes, default lsbc = False - !$acc update host(tra) if (lsbc) call tstop("trcsbc") call tstart("trcsms") - !$acc update device(kef,qsr,mbathy,bfmmask,trn,DAY_LENGTH,vatm,tn,sn,rho,e3t,gdept,ogstm_PH,ogstm_co2) if(lbfm) -#if defined key_trc_sed - !$acc update device(sed_idx,diaflx,e3t,tra,ogstm_sedipi,mbathy) if(lbfm) -#endif IF (lbfm) CALL trcsms - !$acc update host(tra,tra_DIA,tra_DIA_2d,ogstm_sediPI,ogstm_PH) if(lbfm) -#if defined key_trc_sed - !$acc update host(diaflx,zwork) if(lbfm) -#endif call tstop("trcsms") call tstart("trczdf") - !$acc update device(e1t,diaflx,e3t_back,e2t,trb,tmask,e3t,tra,avt,e3w) if (lzdf) IF (lzdf) CALL trczdf ! tracers: vertical diffusion - !$acc update host(diaflx,tra) if (lzdf) call tstop("trczdf") call tstart("snutel") - !$acc update device(tmask,tra) if(lsnu) IF (lsnu) CALL snutel - !$acc update host(tra) if(lsnu) call tstop("snutel") call boundaries%apply_dirichlet() @@ -381,11 +379,9 @@ SUBROUTINE trcstp ! CALL checkValues call tstart("trcnxt") - !$acc update device(tra,tmask) CALL trcnxt ! tracers: fields at next time step - !$acc update host(trb,trn,tra) call tstop("trcnxt") - + trcstpparttime = MPI_WTIME() - trcstpparttime ! cronometer-stop trcstptottime = trcstptottime + trcstpparttime From e7e71411177afc108f6bc55daefe84aabdc5079a Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 18:21:31 +0200 Subject: [PATCH 27/33] move trcave transfers --- src/General/step.f90 | 19 +++++++++++-------- src/PHYS/trcave.f90 | 3 --- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/General/step.f90 b/src/General/step.f90 index 6a9b221b..5afd7f41 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -194,10 +194,6 @@ SUBROUTINE step call tstop("data_assim") #endif - - -! Call Passive tracer model between synchronization for small parallelisation - call tstart("trcstp_all") !$acc update device(tra,tmask) !$acc update device(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,e1t,e2t,e3t,e1u,e2u,e3u,e1v,e2v,e3v,e3w,un,vn,wn,trn,advmask,flx_ridxt,diaflx) if(ladv) !$acc update device(atm,e3t) if(latmosph) @@ -208,7 +204,17 @@ SUBROUTINE step !$acc update device(sed_idx,diaflx,e3t,ogstm_sedipi,mbathy) if(lbfm) #endif !$acc update device(e1t,diaflx,e3t_back,e2t,trb,e3t,avt,e3w) if (lzdf) + !$acc update device(traIO,trn,umask,vmask,tmask,traIO_HIGH,highfreq_table,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,sn,tn,wn,avt,e3t,un,vn,tra_DIA_IO,tra_DIA,tra_DIA_2d_IO,tra_DIA_2d,vatmIO,empIO,qsrIO,vatm,emp,qsr,highfreq_table_dia,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH,highfreq_table_dia2d) + + +! Call Passive tracer model between synchronization for small parallelisation + call tstart("trcstp_all") CALL trcstp ! se commento questo non fa calcoli + call tstop("trcstp_all") + call tstart("trcave") + call trcave + call tstop("trcave") + !$acc update host(trb,trn,tra) !$acc update host(diaflx) if(lhdf) !$acc update host(zaa,zbb,zcc,inv_eu,inv_ev,inv_et,big_fact_zaa,big_fact_zbb,big_fact_zcc,zbtr_arr,diaflx) if(ladv) @@ -217,10 +223,7 @@ SUBROUTINE step !$acc update host(diaflx,zwork) if(lbfm) #endif !$acc update host(diaflx) if (lzdf) - call tstop("trcstp_all") - call tstart("trcave") - call trcave - call tstop("trcave") + !$acc update host(traIO,traIO_HIGH,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH) elapsed_time_1 = elapsed_time_1 + rdt elapsed_time_2 = elapsed_time_2 + rdt diff --git a/src/PHYS/trcave.f90 b/src/PHYS/trcave.f90 index 617a0212..fe2d642d 100644 --- a/src/PHYS/trcave.f90 +++ b/src/PHYS/trcave.f90 @@ -20,8 +20,6 @@ SUBROUTINE trcave elapsed_time = elapsed_time_2 inv_incremented_time = 1./(elapsed_time_2 + rdt) - !$acc update device(traIO,trn,umask,vmask,tmask,traIO_HIGH,highfreq_table,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,sn,tn,wn,avt,e3t,un,vn,tra_DIA_IO,tra_DIA,tra_DIA_2d_IO,tra_DIA_2d,vatmIO,empIO,qsrIO,vatm,emp,qsr,highfreq_table_dia,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH,highfreq_table_dia2d) - !$acc parallel loop gang vector collapse(4) default(present) async(queue) DO jn=1 ,jptra @@ -243,7 +241,6 @@ SUBROUTINE trcave endif ! lfbm !$acc wait(queue) - !$acc update host(traIO,traIO_HIGH,snIO,tnIO,wnIO,avtIO,e3tIO,unIO,vnIO,vatmIO,empIO,qsrIO,tra_DIA_IO,tra_DIA_2d_IO,tra_DIA_2d,tra_DIA_IO_HIGH,tra_DIA_2d_IO_HIGH) ave_partTime = MPI_WTIME() - ave_partTime ave_TotTime = ave_TotTime + ave_partTime From d1e98646f37f0a37513025b1433a9d0f65acb97a Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Fri, 17 May 2024 18:32:15 +0200 Subject: [PATCH 28/33] step: more timers --- src/General/step.f90 | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/General/step.f90 b/src/General/step.f90 index 5afd7f41..50b932fc 100644 --- a/src/General/step.f90 +++ b/src/General/step.f90 @@ -93,7 +93,7 @@ SUBROUTINE step call tstart("step_total") DO WHILE (.not.ISOVERTIME(datestring)) - + call tstart("step") call tstart("step_1") stpparttime = MPI_WTIME() ! stop cronomether COMMON_DATESTRING = DATEstring @@ -135,10 +135,12 @@ SUBROUTINE step ! For offline simulation READ DATA or precalculalted dynamics fields ! ------------------------------------------------------------------ - call tstart("forcing") + call tstart("forcing_phys") CALL forcings_PHYS(DATEstring) + call tstop("forcing_phys") + call tstart("forcing_kext") CALL forcings_KEXT(datestring) - call tstop("forcing") + call tstop("forcing_kext") ! ---------------------------------------------------------------------- ! BEGIN BC_REFACTORING SECTION @@ -152,11 +154,15 @@ SUBROUTINE step ! END BC_REFACTORING SECTION ! --------------------------------------------------------------------- - call tstart("bc+eos") + call tstart("bc_atm") CALL bc_atm (DATEstring) ! CALL dtatrc(istp,2) + call tstop("bc_atm") + call tstart("bc_co2") CALL bc_co2 (DATEstring) + call tstop("bc_co2") + call tstart("eos") CALL eos () ! Water density - call tstop("bc+eos") + call tstop("eos") @@ -208,9 +214,9 @@ SUBROUTINE step ! Call Passive tracer model between synchronization for small parallelisation - call tstart("trcstp_all") + call tstart("trcstp") CALL trcstp ! se commento questo non fa calcoli - call tstop("trcstp_all") + call tstop("trcstp") call tstart("trcave") call trcave call tstop("trcave") @@ -277,7 +283,8 @@ SUBROUTINE step !+++++++++++++++++++++++++++++c datestring = UPDATE_TIMESTRING(datestring, rdt) TAU = TAU + 1 - END DO + call tstop("step") + END DO CONTAINS From 40280d51f485f690cbe90e07db1bc3b00e6b2dec Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Thu, 4 Jul 2024 15:47:58 +0200 Subject: [PATCH 29/33] leonardo.nvhpc: fix module version --- compilers/machine_modules/leonardo.nvhpc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/compilers/machine_modules/leonardo.nvhpc b/compilers/machine_modules/leonardo.nvhpc index 644e0093..0b2664f3 100644 --- a/compilers/machine_modules/leonardo.nvhpc +++ b/compilers/machine_modules/leonardo.nvhpc @@ -1,12 +1,14 @@ module load cmake -module load nvhpc -module load cuda +module load nvhpc/23.11 +module load openmpi/4.1.6--nvhpc--23.11 module load netcdf-fortran/4.6.1--openmpi--4.1.6--nvhpc--23.11 -export PATH=$(echo $PATH | sed -r 's|cuda/samples/bin/x86_64/linux/release/:||') +module load cuda/12.3 export CC=pgcc export NETCDF_CFLAGS=$(nc-config --cflags) export NETCDF_CLIBS=$(nc-config --libs) +export NETCDF_FFLAGS=$(nc-config --fflags) +export NETCDF_FLIBS=$(nc-config --flibs) export NETCDF_LIB=$(nc-config --libdir) export NETCDF_INC=$(nc-config --includedir) From f72f9c8d69edfc89e54280fbb01cd556c9db48ca Mon Sep 17 00:00:00 2001 From: Stefano Campanella <15182642+stefanocampanella@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:48:17 +0200 Subject: [PATCH 30/33] Fix leonardo.nvhpc and leonardo.intel modules scripts --- compilers/machine_modules/leonardo.intel | 8 ++++++++ compilers/machine_modules/leonardo.nvhpc | 8 +++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/compilers/machine_modules/leonardo.intel b/compilers/machine_modules/leonardo.intel index 73bb0586..123d9a69 100644 --- a/compilers/machine_modules/leonardo.intel +++ b/compilers/machine_modules/leonardo.intel @@ -12,3 +12,11 @@ export NETCDF_FFLAGS=$(nf-config --fflags) export NETCDF_FLIBS=$(nf-config --flibs) export NETCDFF_LIB=$(nf-config --prefix)/lib export NETCDFF_INC=$(nf-config --includedir) + +SIMPLE_TIMER_ROOT=/leonardo_work/OGS23_PRACE_IT_0/llucido0/simple-timer/nvhpc--23.11_cuda--12.3_openmpi--4.1.6 +export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include" +export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}" +export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib" +export SIMPLE_TIMER_LIBS="-L ${SIMPLE_TIMER_LIB_DIR} -lsimple_timer -lsimple_timer_f" + +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${SIMPLE_TIMER_LIB_DIR}" diff --git a/compilers/machine_modules/leonardo.nvhpc b/compilers/machine_modules/leonardo.nvhpc index 0b2664f3..644e0093 100644 --- a/compilers/machine_modules/leonardo.nvhpc +++ b/compilers/machine_modules/leonardo.nvhpc @@ -1,14 +1,12 @@ module load cmake -module load nvhpc/23.11 -module load openmpi/4.1.6--nvhpc--23.11 +module load nvhpc +module load cuda module load netcdf-fortran/4.6.1--openmpi--4.1.6--nvhpc--23.11 -module load cuda/12.3 +export PATH=$(echo $PATH | sed -r 's|cuda/samples/bin/x86_64/linux/release/:||') export CC=pgcc export NETCDF_CFLAGS=$(nc-config --cflags) export NETCDF_CLIBS=$(nc-config --libs) -export NETCDF_FFLAGS=$(nc-config --fflags) -export NETCDF_FLIBS=$(nc-config --flibs) export NETCDF_LIB=$(nc-config --libdir) export NETCDF_INC=$(nc-config --includedir) From 71596d90cc648305546ceaa62121781cf3a5dc1e Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Mon, 29 Jul 2024 09:43:44 +0200 Subject: [PATCH 31/33] fix Intel simple-timer location --- compilers/machine_modules/leonardo.intel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compilers/machine_modules/leonardo.intel b/compilers/machine_modules/leonardo.intel index 123d9a69..758bf30c 100644 --- a/compilers/machine_modules/leonardo.intel +++ b/compilers/machine_modules/leonardo.intel @@ -13,7 +13,7 @@ export NETCDF_FLIBS=$(nf-config --flibs) export NETCDFF_LIB=$(nf-config --prefix)/lib export NETCDFF_INC=$(nf-config --includedir) -SIMPLE_TIMER_ROOT=/leonardo_work/OGS23_PRACE_IT_0/llucido0/simple-timer/nvhpc--23.11_cuda--12.3_openmpi--4.1.6 +SIMPLE_TIMER_ROOT=/leonardo_work/OGS23_PRACE_IT_0/llucido0/simple-timer/oneapi--2023.2.0_intelmpi--2021.10.0 export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include" export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}" export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib" From 9633399f4df2bb8c09cbba42f35d5d9fe9c72058 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Thu, 1 Aug 2024 16:52:29 +0200 Subject: [PATCH 32/33] trchdf: allocate only once --- src/PHYS/trchdf.f90 | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/src/PHYS/trchdf.f90 b/src/PHYS/trchdf.f90 index 2d4fc889..7084e6d5 100644 --- a/src/PHYS/trchdf.f90 +++ b/src/PHYS/trchdf.f90 @@ -110,7 +110,7 @@ SUBROUTINE trchdf INTEGER :: myji,myjj INTEGER :: locsum,jklef,jjlef,jilef,jkrig,jjrig,jirig !INTEGER, allocatable :: jarr_hdf(:,:,:),jarr_hdf_flx(:) - double precision, allocatable,dimension(:,:,:) :: zlt, ztu, ztv + double precision, allocatable,dimension(:,:,:),save :: zlt, ztu, ztv integer :: queue logical :: use_gpu !!---------------------------------------------------------------------- @@ -185,12 +185,11 @@ SUBROUTINE trchdf !$acc update device(hdfmask) hdf_initialized=.true. - ENDIF - - allocate(zlt (jpk,jpj,jpi)) - allocate(ztu (jpk,jpj,jpi)) - allocate(ztv (jpk,jpj,jpi)) + allocate(zlt(jpk,jpj,jpi)) + allocate(ztu(jpk,jpj,jpi)) + allocate(ztv(jpk,jpj,jpi)) !$acc enter data create(zlt,ztu,ztv) + ENDIF ! Metric arrays calculated out of the initialisation phase(for z- or s-coordinates) ! !! ---------------------------------- @@ -403,17 +402,6 @@ SUBROUTINE trchdf call tstop("trchdf_tracer") call tstart("trchdf_2") - ! deallocate(hdfmask) - ! deallocate(zeeu) - ! deallocate(zeev) - ! deallocate(zbtr) - - !$acc exit data delete(zlt,ztu,ztv) - deallocate(zlt) - deallocate(ztu) - deallocate(ztv) - - trcbilaphdfparttime = MPI_WTIME() - trcbilaphdfparttime trcbilaphdftottime = trcbilaphdftottime + trcbilaphdfparttime From 7b6423532dd0c19b6d70f678385c87bb62bbff07 Mon Sep 17 00:00:00 2001 From: Loris Lucido Date: Tue, 6 Aug 2024 16:07:55 +0200 Subject: [PATCH 33/33] activate simple_timer for bfm --- compilers/machine_modules/leonardo.intel | 1 + compilers/machine_modules/leonardo.nvhpc | 1 + 2 files changed, 2 insertions(+) diff --git a/compilers/machine_modules/leonardo.intel b/compilers/machine_modules/leonardo.intel index 758bf30c..3a08fd1a 100644 --- a/compilers/machine_modules/leonardo.intel +++ b/compilers/machine_modules/leonardo.intel @@ -18,5 +18,6 @@ export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include" export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}" export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib" export SIMPLE_TIMER_LIBS="-L ${SIMPLE_TIMER_LIB_DIR} -lsimple_timer -lsimple_timer_f" +export BFM_TIMER_DEFINITIONS="-DBFM_USE_SIMPLE_TIMER" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${SIMPLE_TIMER_LIB_DIR}" diff --git a/compilers/machine_modules/leonardo.nvhpc b/compilers/machine_modules/leonardo.nvhpc index 644e0093..a7a7c4b5 100644 --- a/compilers/machine_modules/leonardo.nvhpc +++ b/compilers/machine_modules/leonardo.nvhpc @@ -21,5 +21,6 @@ export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include" export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}" export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib" export SIMPLE_TIMER_LIBS="-L ${SIMPLE_TIMER_LIB_DIR} -lsimple_timer -lsimple_timer_f" +export BFM_TIMER_DEFINITIONS="-DBFM_USE_SIMPLE_TIMER" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${SIMPLE_TIMER_LIB_DIR}"