Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trcsms GPU port #6

Open
wants to merge 14 commits into
base: dev_gpu
Choose a base branch
from
17 changes: 16 additions & 1 deletion src/BIO/BIO_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ MODULE BIO_mem
double precision, allocatable :: ogstm_ph(:,:,:) ! GUESS for FOLLOWS algorithm
double precision, allocatable :: NPPF2(:,:,:)
double precision, allocatable :: ogstm_co2(:,:), co2_IO(:,:,:)
double precision, allocatable :: sediPPY(:,:)
double precision, allocatable :: local_D3DIAGNOS(:,:)
double precision, allocatable :: local_D2DIAGNOS(:,:)
double precision, allocatable :: er(:,:)
double precision:: ice


Expand All @@ -38,7 +42,7 @@ subroutine myalloc_BIO()
allocate(co2_IO(jpj,jpi,2))

co2_IO = huge(co2_IO(1,1,1))
allocate(ogstm_sedipi(jpk,jpj,jpi,4))
allocate(ogstm_sedipi(jpk,jpj,jpi,4))
ogstm_sedipi = huge(ogstm_sedipi(1,1,1,1))
allocate(ogstm_ph(jpk,jpj,jpi))
ogstm_ph = huge(ogstm_ph(1,1,1))
Expand All @@ -48,6 +52,12 @@ subroutine myalloc_BIO()
! and used in hard_tissue_pump.F also in land points
ice=0

allocate(sediPPY(jpi * jpj * jpk, 4))
allocate(local_D3DIAGNOS(jpi * jpj * jpk, jptra_dia))
allocate(local_D2DIAGNOS(jpi * jpj, jptra_dia_2d))
allocate(er(jpi * jpj * jpk, 11))
!$acc enter data create(ogstm_co2,ogstm_sedipi,ogstm_ph,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS,er)

#ifdef Mem_Monitor
mem_all=get_mem(err) - aux_mem
#endif
Expand All @@ -63,6 +73,11 @@ subroutine clean_memory_bio()
deallocate(ogstm_sedipi)
deallocate(ogstm_ph)
deallocate(NPPF2)
deallocate(sediPPY)
deallocate(local_D3DIAGNOS)
deallocate(local_D2DIAGNOS)
deallocate(er)
!$acc exit data delete(ogstm_co2,ogstm_sedipi,ogstm_ph,sediPPY,local_D3DIAGNOS,local_D2DIAGNOS,er)

end subroutine clean_memory_bio

Expand Down
33 changes: 20 additions & 13 deletions src/BIO/OPT_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ MODULE OPT_mem


INTEGER, allocatable :: itabe(:),imaske(:,:)
double precision, allocatable :: zpar(:,:),xEPS_ogstm(:,:)
double precision, allocatable :: zpar0m(:),zpar100(:)
! double precision, allocatable :: zpar(:,:)
double precision, allocatable :: xEPS_ogstm(:,:)
! double precision, allocatable :: zpar0m(:),zpar100(:)
double precision, allocatable :: kef(:,:)
double precision, allocatable :: kextIO(:,:,:)
real, allocatable :: zkef_f (:,:)
Expand All @@ -40,17 +41,21 @@ subroutine myalloc_OPT()
allocate(imaske(jpk,jpi))
imaske = huge(imaske(1,1))
!!!$omp parallel default (none) shared(jpk,jpi)
allocate(zpar(jpk,jpi))
zpar = huge(zpar(1,1))
allocate(xEPS_ogstm(jpk,jpi))
! allocate(zpar(jpk,jpi))
! zpar = huge(zpar(1,1))
allocate(xEPS_ogstm(jpk,jpi))
!$acc enter data create(xEPS_ogstm)
!$acc kernels default(present)
xEPS_ogstm = huge(xEPS_ogstm(1,1))
allocate(zpar0m(jpi))
zpar0m = huge(zpar0m(1))
allocate(zpar100(jpi))
zpar100 = huge(zpar100(1))
!$acc end kernels
! allocate(zpar0m(jpi))
! zpar0m = huge(zpar0m(1))
! allocate(zpar100(jpi))
! zpar100 = huge(zpar100(1))
!!!$omp end parallel

allocate(kef(jpj,jpi))
allocate(kef(jpj,jpi))
!$acc enter data create(kef)
kef = huge(kef(1,1))
allocate(kextIO(jpj,jpi,2))
kextIO = huge(kextIO(1,1,1))
Expand All @@ -71,10 +76,12 @@ subroutine clean_memory_opt

deallocate(itabe)
deallocate(imaske)
deallocate(zpar)
! deallocate(zpar)
!$acc exit data delete(xEPS_ogstm)
deallocate(xEPS_ogstm)
deallocate(zpar0m)
deallocate(zpar100)
! deallocate(zpar0m)
! deallocate(zpar100)
!$acc exit data delete(kef)
deallocate(kef)
deallocate(kextIO)

Expand Down
22 changes: 20 additions & 2 deletions src/BIO/SED_mem.f90
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ subroutine myalloc_SED()
#endif
dimen_jvsed=0

allocate(sed_idx(nsed))
allocate(sed_idx(nsed))
sed_idx = huge(sed_idx(1))

sed_idx(1) = ppR6c
Expand Down Expand Up @@ -79,22 +79,40 @@ subroutine myalloc_SED()
jarr_sed = huge(jarr_sed(1,1))
allocate(jarr_sed_flx(jpk,jpi*jpj))
jarr_sed_flx = huge(jarr_sed_flx(1,1))
allocate( ztra(nsed,ntids))
#ifndef _OPENACC
allocate( ztra(nsed,ntids))
ztra = huge(ztra(1,1))
allocate(zwork(jpk,nsed, ntids))
zwork = huge(zwork(1,1,1))
#endif
!$acc enter data create(sed_idx,jarr_sed,jarr_sed_flx)


#ifdef Mem_Monitor
mem_all=get_mem(err) - aux_mem
#endif

END subroutine myalloc_SED

#ifdef _OPENACC
subroutine myalloc_SED_gpu()

allocate(ztra(nsed,dimen_jvsed))
allocate(zwork(jpk,nsed,dimen_jvsed))
!$acc enter data create(ztra,zwork)
!$acc kernels default(present)
ztra = huge(ztra(1,1))
zwork = huge(zwork(1,1,1))
!$acc end kernels

end subroutine myalloc_SED_gpu
#endif



subroutine clean_memory_sed

!$acc exit data delete(ztra,zwork,sed_idx,jarr_sed,jarr_sed_flx)
deallocate(sed_idx)
deallocate(jarr_sed)
deallocate(jarr_sed_flx)
Expand Down
174 changes: 97 additions & 77 deletions src/BIO/trcbio.f90
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,7 @@ SUBROUTINE trcbio

IMPLICIT NONE

double precision, dimension(jpi * jpj * jpk, 4) :: sediPPY
double precision, dimension(jpi * jpj * jpk, jptra_dia) :: local_D3DIAGNOS
double precision, dimension(jpi * jpj, jptra_dia_2d) :: local_D2DIAGNOS
double precision, dimension(jpi * jpj * jpk, 11) :: er

integer :: jk, jj, ji, jn, jlinear2d, jlinear3d, bottom
integer :: jk, jj, ji, jn, jlinear2d, jlinear3d, bottom, queue
double precision :: correct_fact, gdept_local, gdeptmax_local

integer :: year, month, day
Expand All @@ -62,6 +57,10 @@ SUBROUTINE trcbio
BIOparttime = MPI_WTIME()

! Initialization

queue=1

!$acc kernels default(present) async(queue)
D3STATE = 1.0
er = 1.0
er(:,10) = 8.1
Expand All @@ -70,120 +69,141 @@ SUBROUTINE trcbio

! ogstm_sediPI appear to be unused
ogstm_sediPI = 0.
!$acc end kernels

! NOTE: this kernel *needs* to be executed synchronously as we need the
! reduced `bottom` scalar value on host before launching the next kernel.
bottom=0
!$acc parallel loop gang vector reduction(max:bottom) collapse(2) default(present)
do ji = 1, jpi
do jj = 1, jpj
bottom = max(bottom,mbathy(jj,ji))
end do
end do
!$acc end parallel loop

! Set D3STATE (pass state to BFM)
!$acc parallel loop gang vector default(present) collapse(4) async(queue)
do jn = 1, jptra
do ji = 1, jpi
do jj = 1, jpj
if (bfmmask(1, jj, ji) == 0) then
cycle
else
bottom = mbathy(jj, ji)
do jk = 1, bottom
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
D3STATE(jlinear3d, jn) = trn(jk, jj, ji, jn)
do jk = 1, bottom
if (.not. bfmmask(1, jj, ji) == 0) then
if (jk <= mbathy(jj, ji)) then
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
D3STATE(jlinear3d, jn) = trn(jk, jj, ji, jn)
endif
endif
end do
end if
end do
end do
end do
!$acc end parallel loop

call read_date_string(COMMON_DATEstring, year, month, day, sec)

! Set er
! Set er
!$acc parallel loop gang vector default(present) collapse(3) async(queue)
do ji = 1, jpi
do jj = 1, jpj
if (bfmmask(1, jj, ji) == 0) then
cycle
else
bottom = mbathy(jj, ji)
do jj = 1, jpj
do jk = 1, bottom
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
if (jk .eq. 1) then
er(jlinear3d, 4) = ice ! from 0 to 1 adimensional
er(jlinear3d, 5) = ogstm_co2(jj, ji) ! CO2 Mixing Ratios (ppm) 390
er(jlinear3d, 7) = DAY_LENGTH(jj, ji) ! fotoperiod expressed in hours
er(jlinear3d, 9) = vatm(jj, ji) ! wind speed (m/s)
end if
er(jlinear3d, 1) = tn(jk, jj, ji) ! Temperature (Celsius)
er(jlinear3d, 2) = sn(jk, jj, ji) ! Salinity PSU
er(jlinear3d, 3) = rho(jk, jj, ji) ! Density Kg/m3
er(jlinear3d, 6) = instant_par_from_sec(sec, xpar(jk, jj, ji)) ! PAR umoles/m2/s | Watt to umoles photons W2E=1./0.217
er(jlinear3d, 8) = e3t(jk, jj, ji) ! depth in meters of the given cell
er(jlinear3d, 10) = ogstm_PH(jk, jj, ji) ! 8.1
if (.not. bfmmask(1, jj, ji) == 0) then
if (jk <= mbathy(jj, ji)) then
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
if (jk .eq. 1) then
er(jlinear3d, 4) = ice ! from 0 to 1 adimensional
er(jlinear3d, 5) = ogstm_co2(jj, ji) ! CO2 Mixing Ratios (ppm) 390
er(jlinear3d, 7) = DAY_LENGTH(jj, ji) ! fotoperiod expressed in hours
er(jlinear3d, 9) = vatm(jj, ji) ! wind speed (m/s)
end if
er(jlinear3d, 1) = tn(jk, jj, ji) ! Temperature (Celsius)
er(jlinear3d, 2) = sn(jk, jj, ji) ! Salinity PSU
er(jlinear3d, 3) = rho(jk, jj, ji) ! Density Kg/m3
er(jlinear3d, 6) = instant_par_from_sec(sec, xpar(jk, jj, ji)) ! PAR umoles/m2/s | Watt to umoles photons W2E=1./0.217
er(jlinear3d, 8) = e3t(jk, jj, ji) ! depth in meters of the given cell
er(jlinear3d, 10) = ogstm_PH(jk, jj, ji) ! 8.1
#ifdef gdept1d
gdept_local = gdept(jk)
gdeptmax_local = gdept(jpk)
gdept_local = gdept(jk)
gdeptmax_local = gdept(jpk)
#else
gdept_local = gdept(jk, jj, ji)
gdeptmax_local = gdept(jpk, jj, ji)
gdept_local = gdept(jk, jj, ji)
gdeptmax_local = gdept(jpk, jj, ji)
#endif
if (gdept_local .lt. 1000.0D0) then
correct_fact = 1.0D0
else if (gdept_local .lt. 2000.0D0) then
correct_fact = 0.25D0
else
correct_fact = 0.0D0
end if
er(jlinear3d, 11) = correct_fact * (gdeptmax_local - gdept_local) / gdept_local
if (gdept_local .lt. 1000.0D0) then
correct_fact = 1.0D0
else if (gdept_local .lt. 2000.0D0) then
correct_fact = 0.25D0
else
correct_fact = 0.0D0
end if
er(jlinear3d, 11) = correct_fact * (gdeptmax_local - gdept_local) / gdept_local
endif
end if
end do
end if
end do
end do
end do

!$acc end parallel loop

!$acc wait(queue)

call BFM1D_Input_EcologyDynamics(mbathy, er) ! here mbathy was bottom
call BFM1D_reset()
call EcologyDynamics()
call BFM1D_Output_EcologyDynamics(sediPPY, local_D3DIAGNOS, local_D2DIAGNOS)

! The following copies could be avoided
!$acc parallel loop gang vector collapse(4) default(present) async(queue)
do jn = 1, max(4, jptra, jptra_dia)
do ji = 1, jpi
do jj = 1, jpj
if (bfmmask(1, jj, ji) == 0) then
cycle
else
bottom = mbathy(jj, ji)
do jk = 1, bottom
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
if (jn .le. jptra) then
tra(jk, jj, ji, jn) = tra(jk, jj, ji, jn) + D3SOURCE(jlinear3d, jn) ! trend
end if
if (jn .le. jptra_dia) then
tra_DIA(jk, jj, ji, jn) = local_D3DIAGNOS(jlinear3d, jn)
end if
if (jn .le. 4) then
ogstm_sediPI(jk, jj, ji, jn) = sediPPY(jlinear3d, jn) ! BFM output of sedimentation speed (m/d)
do jk = 1, bottom
if (.not. bfmmask(1, jj, ji) == 0) then
if (jk <= mbathy(jj, ji)) then
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
if (jn .le. jptra) then
tra(jk, jj, ji, jn) = tra(jk, jj, ji, jn) + D3SOURCE(jlinear3d, jn) ! trend
endif
if (jn .le. jptra_dia) then
tra_DIA(jk, jj, ji, jn) = local_D3DIAGNOS(jlinear3d, jn)
endif
if (jn .le. 4) then
ogstm_sediPI(jk, jj, ji, jn) = sediPPY(jlinear3d, jn) ! BFM output of sedimentation speed (m/d)
endif
end if
end do
end if
end if
end do
end do
end do
end do

!$acc end parallel loop

!$acc parallel loop gang vector collapse(3) default(present) async(queue)
do ji = 1, jpi
do jj = 1, jpj
if (bfmmask(1, jj, ji) == 0) then
cycle
else
bottom = mbathy(jj, ji)
jlinear2d = jj + (ji - 1) * jpj
tra_DIA_2d(:, jj, ji) = local_D2DIAGNOS(jlinear2d, :)
do jk = 1, bottom
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
ogstm_PH(jk, jj, ji) = local_D3DIAGNOS(jlinear3d, pppH) ! Follows solver guess, put 8.0 if pppH is not defined
end do
end if
do jk = 1, bottom
if (.not. bfmmask(1, jj, ji) == 0) then
if (jk <= mbathy(jj, ji)) then
jlinear2d = jj + (ji - 1) * jpj
tra_DIA_2d(:, jj, ji) = local_D2DIAGNOS(jlinear2d, :)
jlinear3d = jk + (jj - 1) * jpk + (ji - 1) * jpk * jpj
ogstm_PH(jk, jj, ji) = local_D3DIAGNOS(jlinear3d, pppH) ! Follows solver guess, put 8.0 if pppH is not defined
end if
end if
end do
end do
end do
!$acc end parallel loop

!---------------------------------------------------------------------
! BEGIN BC_REFACTORING SECTION
!---------------------------------------------------------------------
! XXX: when should we care about this ?
call boundaries%fix_diagnostic_vars()
!----------------------------------------------------------------------
! END BC_REFACTORING SECTION
!---------------------------------------------------------------------

!$acc wait(queue)

BIOparttime = MPI_WTIME() - BIOparttime
BIOtottime = BIOtottime + BIOparttime
END SUBROUTINE trcbio
Loading