Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fixed] trcnxt, snutel, trcdmp, trcsbc GPU port + optim #8

Merged
merged 33 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
a6f7349
add timers in ogstm
dindon-sournois Feb 26, 2024
5fdc7fe
add -Minfo=all for release flag
dindon-sournois Feb 27, 2024
9d1f287
add Minfo=accel and fix one timer
dindon-sournois Apr 22, 2024
a42298d
port trczdf to GPU
dindon-sournois Apr 15, 2024
fed91cb
reduce vector length
dindon-sournois Apr 16, 2024
a066992
move allocations, put async everywhere, use fsx macro
dindon-sournois Apr 24, 2024
ccb2346
fix serial kernel
dindon-sournois Apr 24, 2024
9340275
more timers
dindon-sournois Jul 25, 2024
bf39f7d
remove duplicated mpplnk_my_openacc routine, just use mpplnk_my
dindon-sournois May 7, 2024
43ff973
port trchdf on GPU
dindon-sournois May 7, 2024
0772e24
mpplnk_my timer
dindon-sournois May 7, 2024
9d71a04
trcave GPU port
dindon-sournois May 7, 2024
c3fc5a1
remove debug statement
dindon-sournois May 13, 2024
3918ff0
workaround: data not present on device
dindon-sournois May 13, 2024
7864fa8
trcbio GPU port
dindon-sournois May 13, 2024
df705bb
BFM1D_Input_EcologyDynamics gpu port
dindon-sournois May 14, 2024
5973192
BFM1D_Output_EcologyDynamics gpu port
dindon-sournois May 14, 2024
6d62722
add more timers in trcsms
dindon-sournois May 15, 2024
73eaf3f
trcopt GPU port, XXX: could not be tested
dindon-sournois May 15, 2024
c0527c6
trcsed GPU port
dindon-sournois May 15, 2024
0f6df5b
snutel and trcnxt GPU port
dindon-sournois May 17, 2024
63616e2
trcadv: allocate once
dindon-sournois May 17, 2024
e8a67dc
trcsbc GPU port
dindon-sournois May 17, 2024
50bafc6
trcdmp GPU port
dindon-sournois May 17, 2024
f092b53
simplify trcadv GPU transfers and put them in step.f90
dindon-sournois May 17, 2024
6d8d3a4
move trcadv transfers
dindon-sournois May 17, 2024
e7e7141
move trcave transfers
dindon-sournois May 17, 2024
d1e9864
step: more timers
dindon-sournois May 17, 2024
40280d5
leonardo.nvhpc: fix module version
dindon-sournois Jul 4, 2024
f72f9c8
Fix leonardo.nvhpc and leonardo.intel modules scripts
stefanocampanella Jul 17, 2024
71596d9
fix Intel simple-timer location
dindon-sournois Jul 29, 2024
9633399
trchdf: allocate only once
dindon-sournois Aug 1, 2024
7b64235
activate simple_timer for bfm
dindon-sournois Aug 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions GeneralCmake.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ elseif(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
set(CMAKE_Fortran_FLAGS_DEBUG " -Og -ggdb3 -fimplicit-none -cpp -ffree-line-length-none -Wall -Wextra -fno-omit-frame-pointer -fbounds-check -pedantic -ffpe-trap=invalid,zero,overflow")
set(CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer")
elseif(CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC|PGI")
set(CMAKE_Fortran_FLAGS_RELEASE " -Kieee -g -traceback -fast -acc -gpu=pinned -Mextend -Mpreprocess")
set(CMAKE_Fortran_FLAGS_RELEASE " -Kieee -g -traceback -fast -acc -gpu=pinned -Mextend -Mpreprocess -Minfo=accel")
set(CMAKE_Fortran_FLAGS_DEBUG " -Kieee -g -traceback -O0 -acc -gpu=pinned,debug -Mbounds -Mextend -Mpreprocess -Minfo=accel")
else()
message ("CMAKE_Fortran_COMPILER full path: " ${CMAKE_Fortran_COMPILER})
Expand All @@ -56,6 +56,7 @@ link_directories(${MPI_Fortran_LIBRARIES})
include_directories(${BFM_INCLUDES})
include_directories(${NETCDF_INCLUDES_C})
include_directories(${NETCDFF_INCLUDES_F90})
include_directories($ENV{SIMPLE_TIMER_INCLUDE_DIR})

# Search Fortran module to compile
set( FOLDERS BIO General IO MPI namelists PHYS BC)
Expand All @@ -68,4 +69,4 @@ endforeach()
#building
add_library( ogstm_lib ${FORTRAN_SOURCES})
add_executable (ogstm.xx application/ogstm_main_caller.f90)
target_link_libraries(ogstm.xx ogstm_lib ${NETCDFF_LIBRARIES_F90} ${BFM_LIBRARIES} MPI::MPI_Fortran)
target_link_libraries(ogstm.xx ogstm_lib ${NETCDFF_LIBRARIES_F90} ${BFM_LIBRARIES} MPI::MPI_Fortran $ENV{SIMPLE_TIMER_LIBS})
2 changes: 2 additions & 0 deletions application/ogstm_main_caller.f90
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ PROGRAM OGSTM_MAIN
#ifdef _OPENACC
use openacc
#endif
use simple_timer
implicit none
integer :: info, ierr

Expand All @@ -34,6 +35,7 @@ PROGRAM OGSTM_MAIN
!$OMP END MASTER
!$OMP END PARALLEL

call tprint()
CALL mpi_finalize(info)

END PROGRAM OGSTM_MAIN
9 changes: 9 additions & 0 deletions compilers/machine_modules/leonardo.intel
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ export NETCDF_FFLAGS=$(nf-config --fflags)
export NETCDF_FLIBS=$(nf-config --flibs)
export NETCDFF_LIB=$(nf-config --prefix)/lib
export NETCDFF_INC=$(nf-config --includedir)

# Location of the simple-timer installation used by this toolchain
# (the directory name indicates oneapi 2023.2.0 + intelmpi 2021.10.0).
SIMPLE_TIMER_ROOT=/leonardo_work/OGS23_PRACE_IT_0/llucido0/simple-timer/oneapi--2023.2.0_intelmpi--2021.10.0
# Include directory and the matching compiler include flag.
export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include"
export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}"
# Library directory and the link line for the simple_timer libraries.
export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib"
export SIMPLE_TIMER_LIBS="-L ${SIMPLE_TIMER_LIB_DIR} -lsimple_timer -lsimple_timer_f"
# Preprocessor definition; presumably consumed by the BFM build to enable its timers (confirm).
export BFM_TIMER_DEFINITIONS="-DBFM_USE_SIMPLE_TIMER"

# Append the library directory to the runtime search path. The ':' separator is
# only emitted when LD_LIBRARY_PATH is already non-empty: a leading empty entry
# would make the dynamic loader search the current working directory, and the
# ':+' form also keeps this line safe when the script is sourced under 'set -u'.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${SIMPLE_TIMER_LIB_DIR}"
9 changes: 9 additions & 0 deletions compilers/machine_modules/leonardo.nvhpc
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,12 @@ export NETCDF_FFLAGS=$(nf-config --fflags)
export NETCDF_FLIBS=$(nf-config --flibs)
export NETCDFF_LIB=$(nf-config --prefix)/lib
export NETCDFF_INC=$(nf-config --includedir)

# Location of the simple-timer installation used by this toolchain
# (the directory name indicates nvhpc 23.11 + cuda 12.3 + openmpi 4.1.6).
SIMPLE_TIMER_ROOT=/leonardo_work/OGS23_PRACE_IT_0/llucido0/simple-timer/nvhpc--23.11_cuda--12.3_openmpi--4.1.6
# Include directory and the matching compiler include flag.
export SIMPLE_TIMER_INCLUDE_DIR="${SIMPLE_TIMER_ROOT}/include"
export SIMPLE_TIMER_FLAGS="-I ${SIMPLE_TIMER_INCLUDE_DIR}"
# Library directory and the link line for the simple_timer libraries.
export SIMPLE_TIMER_LIB_DIR="${SIMPLE_TIMER_ROOT}/lib"
export SIMPLE_TIMER_LIBS="-L ${SIMPLE_TIMER_LIB_DIR} -lsimple_timer -lsimple_timer_f"
# Preprocessor definition; presumably consumed by the BFM build to enable its timers (confirm).
export BFM_TIMER_DEFINITIONS="-DBFM_USE_SIMPLE_TIMER"

# Append the library directory to the runtime search path. The ':' separator is
# only emitted when LD_LIBRARY_PATH is already non-empty: a leading empty entry
# would make the dynamic loader search the current working directory, and the
# ':+' form also keeps this line safe when the script is sourced under 'set -u'.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${SIMPLE_TIMER_LIB_DIR}"
17 changes: 16 additions & 1 deletion src/BIO/BIO_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ MODULE BIO_mem
double precision, allocatable :: ogstm_ph(:,:,:) ! GUESS for FOLLOWS algorithm
double precision, allocatable :: NPPF2(:,:,:)
double precision, allocatable :: ogstm_co2(:,:), co2_IO(:,:,:)
double precision, allocatable :: sediPPY(:,:)
double precision, allocatable :: local_D3DIAGNOS(:,:)
double precision, allocatable :: local_D2DIAGNOS(:,:)
double precision, allocatable :: er(:,:)
double precision:: ice


Expand All @@ -38,7 +42,7 @@ subroutine myalloc_BIO()
allocate(co2_IO(jpj,jpi,2))

co2_IO = huge(co2_IO(1,1,1))
allocate(ogstm_sedipi(jpk,jpj,jpi,4))
allocate(ogstm_sedipi(jpk,jpj,jpi,4))
ogstm_sedipi = huge(ogstm_sedipi(1,1,1,1))
allocate(ogstm_ph(jpk,jpj,jpi))
ogstm_ph = huge(ogstm_ph(1,1,1))
Expand All @@ -48,6 +52,12 @@ subroutine myalloc_BIO()
! and used in hard_tissue_pump.F also in land points
ice=0

allocate(sediPPY(jpi * jpj * jpk, 4))
allocate(local_D3DIAGNOS(jpi * jpj * jpk, jptra_dia))
allocate(local_D2DIAGNOS(jpi * jpj, jptra_dia_2d))
allocate(er(jpi * jpj * jpk, 11))
!$acc enter data create(ogstm_co2,ogstm_sedipi,ogstm_ph,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS,er)

#ifdef Mem_Monitor
mem_all=get_mem(err) - aux_mem
#endif
Expand All @@ -63,6 +73,11 @@ subroutine clean_memory_bio()
deallocate(ogstm_sedipi)
deallocate(ogstm_ph)
deallocate(NPPF2)
deallocate(sediPPY)
deallocate(local_D3DIAGNOS)
deallocate(local_D2DIAGNOS)
deallocate(er)
!$acc exit data delete(ogstm_co2,ogstm_sedipi,ogstm_ph,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS,er)

end subroutine clean_memory_bio

Expand Down
8 changes: 7 additions & 1 deletion src/BIO/FN_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ subroutine myalloc_FN()

allocate(jarr_snu(2, jpi*jpj))
jarr_snu = huge(jarr_snu(1,1))
allocate(tra_FN(jpk,jpj,jpi,jptra))
allocate(tra_FN(jpk,jpj,jpi,jptra))
!$acc enter data create(tra_FN)
!$acc kernels default(present)
tra_FN = huge(tra_FN(1,1,1,1))
!$acc end kernels

!CALL OPA_elements(elements,nelements,idx_element)

Expand All @@ -66,7 +69,9 @@ subroutine myalloc_FN()
FN_ranking = huge(FN_ranking(1))


!$acc kernels default(present)
tra_FN=0.
!$acc end kernels

! cor_FN=0.
FN_ranking=0.
Expand All @@ -81,6 +86,7 @@ END subroutine myalloc_FN

subroutine clean_memory_fn

!$acc exit data delete(tra_FN)
deallocate(jarr_snu)
deallocate(tra_FN)
deallocate(TOTcalc)
Expand Down
33 changes: 20 additions & 13 deletions src/BIO/OPT_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ MODULE OPT_mem


INTEGER, allocatable :: itabe(:),imaske(:,:)
double precision, allocatable :: zpar(:,:),xEPS_ogstm(:,:)
double precision, allocatable :: zpar0m(:),zpar100(:)
! double precision, allocatable :: zpar(:,:)
double precision, allocatable :: xEPS_ogstm(:,:)
! double precision, allocatable :: zpar0m(:),zpar100(:)
double precision, allocatable :: kef(:,:)
double precision, allocatable :: kextIO(:,:,:)
real, allocatable :: zkef_f (:,:)
Expand All @@ -40,17 +41,21 @@ subroutine myalloc_OPT()
allocate(imaske(jpk,jpi))
imaske = huge(imaske(1,1))
!!!$omp parallel default (none) shared(jpk,jpi)
allocate(zpar(jpk,jpi))
zpar = huge(zpar(1,1))
allocate(xEPS_ogstm(jpk,jpi))
! allocate(zpar(jpk,jpi))
! zpar = huge(zpar(1,1))
allocate(xEPS_ogstm(jpk,jpi))
!$acc enter data create(xEPS_ogstm)
!$acc kernels default(present)
xEPS_ogstm = huge(xEPS_ogstm(1,1))
allocate(zpar0m(jpi))
zpar0m = huge(zpar0m(1))
allocate(zpar100(jpi))
zpar100 = huge(zpar100(1))
!$acc end kernels
! allocate(zpar0m(jpi))
! zpar0m = huge(zpar0m(1))
! allocate(zpar100(jpi))
! zpar100 = huge(zpar100(1))
!!!$omp end parallel

allocate(kef(jpj,jpi))
allocate(kef(jpj,jpi))
!$acc enter data create(kef)
kef = huge(kef(1,1))
allocate(kextIO(jpj,jpi,2))
kextIO = huge(kextIO(1,1,1))
Expand All @@ -71,10 +76,12 @@ subroutine clean_memory_opt

deallocate(itabe)
deallocate(imaske)
deallocate(zpar)
! deallocate(zpar)
!$acc exit data delete(xEPS_ogstm)
deallocate(xEPS_ogstm)
deallocate(zpar0m)
deallocate(zpar100)
! deallocate(zpar0m)
! deallocate(zpar100)
!$acc exit data delete(kef)
deallocate(kef)
deallocate(kextIO)

Expand Down
26 changes: 17 additions & 9 deletions src/BIO/SED_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ subroutine myalloc_SED()
#endif
dimen_jvsed=0

allocate(sed_idx(nsed))
allocate(sed_idx(nsed))
sed_idx = huge(sed_idx(1))

sed_idx(1) = ppR6c
Expand Down Expand Up @@ -78,23 +78,31 @@ subroutine myalloc_SED()
allocate(jarr_sed(2, jpi*jpj))
jarr_sed = huge(jarr_sed(1,1))
allocate(jarr_sed_flx(jpk,jpi*jpj))
jarr_sed_flx = huge(jarr_sed_flx(1,1))
allocate( ztra(nsed,ntids))
ztra = huge(ztra(1,1))
allocate(zwork(jpk,nsed, ntids))
zwork = huge(zwork(1,1,1))
jarr_sed_flx = huge(jarr_sed_flx(1,1))
!$acc enter data create(sed_idx,jarr_sed,jarr_sed_flx)


#ifdef Mem_Monitor
mem_all=get_mem(err) - aux_mem
#endif

END subroutine myalloc_SED




! Allocate the work arrays ztra(nsed,dimen_jvsed) and zwork(jpk,nsed,dimen_jvsed),
! create their OpenACC device counterparts, and fill them with the huge() sentinel.
! NOTE(review): both extents depend on dimen_jvsed, so this presumably has to be
! called after dimen_jvsed has received its final value - confirm against the caller.
subroutine myalloc_SED_ztra_zwork()

! Host allocations; the second extent of ztra / third extent of zwork is dimen_jvsed.
allocate(ztra(nsed,dimen_jvsed))
allocate(zwork(jpk,nsed,dimen_jvsed))
! Create device copies without transferring any host data.
!$acc enter data create(ztra,zwork)
! default(present): the region relies on the enter data directive above having
! already placed both arrays on the device.
!$acc kernels default(present)
! Sentinel fill with the largest representable value of each array's kind.
! NOTE(review): in an OpenACC build this assignment presumably touches only the
! device copies - confirm whether the host copies also need the sentinel.
ztra = huge(ztra(1,1))
zwork = huge(zwork(1,1,1))
!$acc end kernels

end subroutine myalloc_SED_ztra_zwork

subroutine clean_memory_sed

!$acc exit data delete(ztra,zwork,sed_idx,jarr_sed,jarr_sed_flx)
deallocate(sed_idx)
deallocate(jarr_sed)
deallocate(jarr_sed_flx)
Expand Down
Loading