From 47ac7fa6fca4dc35e0de0853140ce7e30faca0bb Mon Sep 17 00:00:00 2001 From: Fredrik Jansson <fjansson@abo.fi> Date: Thu, 8 Jun 2023 14:40:16 +0200 Subject: [PATCH 01/19] modforces: forces and lstend cleanup and optimization i,j,n loops as vector expressions. Avoids short inner loop over n. Remove data dependencies for better automatic OpenMP parallelization with Fujitsu compiler. In lstend, merge k=1 and k>1 cases. --- src/modforces.f90 | 154 +++++++++++++--------------------------------- 1 file changed, 43 insertions(+), 111 deletions(-) diff --git a/src/modforces.f90 b/src/modforces.f90 index 93d33f4e0..09bbe3226 100644 --- a/src/modforces.f90 +++ b/src/modforces.f90 @@ -63,69 +63,36 @@ subroutine forces use modglobal, only : i1,j1,kmax,dzh,dzf,grav, lpressgrad use modfields, only : sv0,up,vp,wp,thv0h,dpdxl,dpdyl,thvh use moduser, only : force_user - use modmicrodata, only : imicro, imicro_bulk, imicro_bin, imicro_sice,iqr + use modmicrodata, only : imicro, imicro_bulk, imicro_bin, imicro_sice, imicro_sice2, iqr implicit none - integer i, j, k, jm, jp, km, kp + integer i, j, k if (lforce_user) call force_user - if((imicro==imicro_sice).or.(imicro==imicro_bulk).or.(imicro==imicro_bin)) then + if (lpressgrad) then + do k=1,kmax + up(:,:,k) = up(:,:,k) - dpdxl(k) !RN LS pressure gradient force in x,y directions; + vp(:,:,k) = vp(:,:,k) - dpdyl(k) + end do + end if + + if((imicro==imicro_sice).or.(imicro==imicro_sice2).or.(imicro==imicro_bulk).or.(imicro==imicro_bin)) then do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - - if (lpressgrad) then - up(i,j,k) = up(i,j,k) - dpdxl(k) !RN LS pressure gradient force in x,y directions; - vp(i,j,k) = vp(i,j,k) - dpdyl(k) - end if - wp(i,j,k) = wp(i,j,k) + grav*(thv0h(i,j,k)-thvh(k))/thvh(k) - & - grav*(sv0(i,j,k,iqr)*dzf(k-1)+sv0(i,j,k-1,iqr)*dzf(k))/(2.0*dzh(k)) - end do - end do + wp(:,:,k) = wp(:,:,k) + grav*(thv0h(:,:,k)-thvh(k))/thvh(k) - & + grav*(sv0(:,:,k,iqr)*dzf(k-1)+sv0(:,:,k-1,iqr)*dzf(k))/(2.0*dzh(k)) end do else do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - up(i,j,k) = up(i,j,k) - dpdxl(k) - vp(i,j,k) = vp(i,j,k) - dpdyl(k) - wp(i,j,k) = wp(i,j,k) + grav*(thv0h(i,j,k)-thvh(k))/thvh(k) - end do - end do + wp(:,:,k) = wp(:,:,k) + grav*(thv0h(:,:,k)-thvh(k))/thvh(k) end do end if ! -------------------------------------------- ! special treatment for lowest full level: k=1 ! -------------------------------------------- + wp(:,:,1) = 0.0 - do j=2,j1 - jp = j+1 - jm = j-1 - do i=2,i1 - - if (lpressgrad) then - up(i,j,1) = up(i,j,1) - dpdxl(1) - vp(i,j,1) = vp(i,j,1) - dpdyl(1) - end if - - wp(i,j,1) = 0.0 - - end do - end do -! ----------------------------------------------end i,j-loop - - - return end subroutine forces subroutine coriolis @@ -152,7 +119,7 @@ subroutine coriolis integer i, j, k, jm, jp, km, kp if (lcoriol .eqv. .false.) return - + do k=2,kmax kp=k+1 km=k-1 @@ -232,80 +199,45 @@ subroutine lstend dqtdtls, dthldtls, dudtls, dvdtls implicit none - integer i,j,k,n,kp,km - real subs_thl,subs_qt,subs_sv,subs_u,subs_v + integer k,kp,km ! 1. DETERMINE LARGE SCALE TENDENCIES ! -------------------------------- ! 1.1 lowest model level above surface : only downward component - - subs_thl = 0. - subs_qt = 0. - subs_sv = 0. - subs_u = 0. - subs_v = 0. - - do j=2,j1 - do i=2,i1 - k = 1 - if (whls(2).lt.0) then !neglect effect of mean ascending on tendencies at the lowest full level - subs_thl = 0.5*whls(2) *(thl0(i,j,2)-thl0(i,j,1))/dzh(2) - subs_qt = 0.5*whls(2) *(qt0(i,j,2)-qt0(i,j,1) )/dzh(2) - if (lmomsubs) then - subs_u = 0.5*whls(2) *(u0(i,j,2)-u0(i,j,1))/dzh(2) - subs_v = 0.5*whls(2) *(v0(i,j,2)-v0(i,j,1))/dzh(2) - endif - do n=1,nsv - subs_sv = 0.5*whls(2) *(sv0(i,j,2,n)-sv0(i,j,1,n) )/dzh(2) - svp(i,j,1,n) = svp(i,j,1,n)-subs_sv - enddo - endif - thlp(i,j,1) = thlp(i,j,1) -u0av(1)*dthldxls(1)-v0av(1)*dthldyls(1)-subs_thl + dthldtls(1) - qtp(i,j,1) = qtp (i,j,1) -u0av(1)*dqtdxls (1)-v0av(1)*dqtdyls (1)-subs_qt + dqtdtls(1) - up (i,j,1) = up (i,j,1) -u0av(1)*dudxls (1)-v0av(1)*dudyls (1)-subs_u + dudtls(1) - vp (i,j,1) = vp (i,j,1) -u0av(1)*dvdxls (1)-v0av(1)*dvdyls (1)-subs_v + dvdtls(1) - end do - end do - ! 1.2 other model levels twostream - do k=2,kmax + do k=1,kmax kp=k+1 km=k-1 - do j=2,j1 - do i=2,i1 - if (whls(kp).lt.0) then !downwind scheme for subsidence - subs_thl = whls(kp) * (thl0(i,j,kp) - thl0(i,j,k))/dzh(kp) - subs_qt = whls(kp) * (qt0 (i,j,kp) - qt0 (i,j,k))/dzh(kp) - if (lmomsubs) then - subs_u = whls(kp) * (u0(i,j,kp) - u0(i,j,k))/dzh(kp) - subs_v = whls(kp) * (v0(i,j,kp) - v0(i,j,k))/dzh(kp) - endif - do n=1,nsv - subs_sv = whls(kp) *(sv0(i,j,kp,n) - sv0(i,j,k,n))/dzh(kp) - svp(i,j,k,n) = svp(i,j,k,n)-subs_sv - enddo - else !downwind scheme for mean upward motions - subs_thl = whls(k) * (thl0(i,j,k) - thl0(i,j,km))/dzh(k) - subs_qt = whls(k) * (qt0 (i,j,k) - qt0 (i,j,km))/dzh(k) + + if (whls(kp).lt.0) then !downwind scheme for subsidence + thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k) - whls(kp) * (thl0(2:i1,2:j1,kp) - thl0(2:i1,2:j1,k))/dzh(kp) + qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k) - whls(kp) * (qt0 (2:i1,2:j1,kp) - qt0 (2:i1,2:j1,k))/dzh(kp) + if (lmomsubs) then + up(2:i1,2:j1,k) = up(2:i1,2:j1,k) - whls(kp) * (u0(2:i1,2:j1,kp) - u0(2:i1,2:j1,k))/dzh(kp) + vp(2:i1,2:j1,k) = vp(2:i1,2:j1,k) - whls(kp) * (v0(2:i1,2:j1,kp) - v0(2:i1,2:j1,k))/dzh(kp) + endif + + svp(2:i1,2:j1,k,:) = svp(2:i1,2:j1,k,:) - whls(kp) * (sv0(2:i1,2:j1,kp,:) - sv0(2:i1,2:j1,k,:))/dzh(kp) + + else !downwind scheme for mean upward motions + if (k > 1) then !neglect effect of mean ascending on tendencies at the lowest full level + thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k) - whls(k) * (thl0(2:i1,2:j1,k) - thl0(2:i1,2:j1,km))/dzh(k) + qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k) - whls(k) * (qt0 (2:i1,2:j1,k) - qt0 (2:i1,2:j1,km))/dzh(k) if (lmomsubs) then - subs_u = whls(k) * (u0(i,j,k) - u0(i,j,km))/dzh(k) - subs_v = whls(k) * (v0(i,j,k) - v0(i,j,km))/dzh(k) + up(2:i1,2:j1,k) = up(2:i1,2:j1,k) - whls(k) * (u0(2:i1,2:j1,k) - u0(2:i1,2:j1,km))/dzh(k) + vp(2:i1,2:j1,k) = vp(2:i1,2:j1,k) - whls(k) * (v0(2:i1,2:j1,k) - v0(2:i1,2:j1,km))/dzh(k) endif - do n=1,nsv - subs_sv = whls(k) * (sv0(i,j,k,n) - sv0(i,j,km,n))/dzh(k) - svp(i,j,k,n) = svp(i,j,k,n)-subs_sv - enddo - endif - - thlp(i,j,k) = thlp(i,j,k)-u0av(k)*dthldxls(k)-v0av(k)*dthldyls(k)-subs_thl + dthldtls(k) - qtp (i,j,k) = qtp (i,j,k)-u0av(k)*dqtdxls (k)-v0av(k)*dqtdyls (k)-subs_qt + dqtdtls(k) - up (i,j,k) = up (i,j,k)-u0av(k)*dudxls (k)-v0av(k)*dudyls (k)-subs_u + dudtls(k) - vp (i,j,k) = vp (i,j,k)-u0av(k)*dvdxls (k)-v0av(k)*dvdyls (k)-subs_v + dvdtls(k) - - enddo - enddo + svp(2:i1,2:j1,k,:) = svp(2:i1,2:j1,k,:)-whls(k) * (sv0(2:i1,2:j1,k,:) - sv0(2:i1,2:j1,km,:))/dzh(k) + endif + endif + + thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k)-u0av(k)*dthldxls(k)-v0av(k)*dthldyls(k) + dthldtls(k) + qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k)-u0av(k)*dqtdxls (k)-v0av(k)*dqtdyls (k) + dqtdtls(k) + up (2:i1,2:j1,k) = up (2:i1,2:j1,k)-u0av(k)*dudxls (k)-v0av(k)*dudyls (k) + dudtls(k) + vp (2:i1,2:j1,k) = vp (2:i1,2:j1,k)-u0av(k)*dvdxls (k)-v0av(k)*dvdyls (k) + dvdtls(k) + enddo return From 3fe0063101f57f0a66283cd951f72a5aa1ca0270 Mon Sep 17 00:00:00 2001 From: Fredrik Jansson <fjansson@abo.fi> Date: Tue, 17 Jan 2023 14:29:18 +0100 Subject: [PATCH 02/19] Put sgs_surface_fix back in NAMSUBGRID namelist The fix can still be required to avoid too high TKE production at the surface. The default is still false. --- src/modsubgrid.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index ab5598061..a407ba5d1 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -107,7 +107,7 @@ subroutine subgridnamelist integer :: ierr namelist/NAMSUBGRID/ & - ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,ch1 + ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1 if(myid==0)then open(ifnamopt,file=fname_options,status='old',iostat=ierr) From 450422e835add2ab7a49f5e613b8a13e1fba8b76 Mon Sep 17 00:00:00 2001 From: Fredrik Jansson <fjansson@abo.fi> Date: Wed, 7 Jun 2023 15:17:18 +0200 Subject: [PATCH 03/19] modsubgrid: simplify subgrid TKE scheme and add anisotropic diffusion option Move subgrid scheme selection out of for loops for clarity and better vectorization. To enable anisotropic diffusion, set lanisotrop = .true. in NAMSUBGRID namelist. Anisotropic diffusion implemented by Stephan de Roode. The anisotropic scheme and the effects of using it are documented here: de Roode, S. R., Siebesma, A. P., Jansson, F., & Janssens, M. Dependency of mesoscale organization on grid anisotropy in large-eddy simulations of convective boundary layers at Gray Zone resolutions. JAMES, 14, e2022MS003095 (2022) https://doi.org/10.1029/2022MS003095 --- src/modbudget.f90 | 22 +++--- src/modglobal.f90 | 4 +- src/modsubgrid.f90 | 169 ++++++++++++++++++++++++++++------------- src/modsubgriddata.f90 | 4 + 4 files changed, 133 insertions(+), 66 deletions(-) diff --git a/src/modbudget.f90 b/src/modbudget.f90 index b815ea244..1a0fa194f 100644 --- a/src/modbudget.f90 +++ b/src/modbudget.f90 @@ -213,7 +213,7 @@ subroutine do_genbudget ijtot,cu,cv,grav, & dxi,dyi,dx2i,dy2i use modsurfdata,only : ustar - use modsubgriddata, only : ekm + use modsubgriddata, only : ekm,anis_fac use modpois, only : p use modfields, only : u0,v0,w0,thv0h,u0av,v0av,rhobf,rhobh,thvh !cstep use modtilt, only : adjustbudget,ltilted @@ -407,12 +407,12 @@ subroutine do_genbudget tau1j(i,j,k) = & ( ekm(i,j,k) * (u0(i+1,j,k)-u0(i,j,k)) & - -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i & + -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) & + & ( empo * ( (u0(i,jp,k)-u0(i,j,k)) *dyi & +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) & -emmo * ( (u0(i,j,k)-u0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) * dyi & + +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) * dyi * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k)) /dzh(kp) & +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) & @@ -455,10 +455,10 @@ subroutine do_genbudget ( epmo * ( (v0(i+1,j,k)-v0(i,j,k)) *dxi & +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) & -emmo * ( (v0(i,j,k)-v0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) * dxi & + +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) * dxi * anis_fac(k) & + & ( ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) & - -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i & + -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k)) /dzh(kp) & +(w0(i,j,kp)-w0(i,jm,kp)) *dyi) & @@ -501,12 +501,12 @@ subroutine do_genbudget ( epom * ( (w0(i+1,j,k)-w0(i,j,k)) *dxi & +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) & -emom * ( (w0(i,j,k)-w0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))*dxi & + +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))*dxi * anis_fac(k) & + & ( eopm * ( (w0(i,jp,k)-w0(i,j,k)) *dyi & +(v0(i,jp,k)-v0(i,jp,km)) /dzh(k) ) & -eomm * ( (w0(i,j,k)-w0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))*dyi & + +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))*dyi * anis_fac(k) & + & ( rhobf(k)/rhobh(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) & - rhobf(km)/rhobh(k) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. & @@ -553,12 +553,12 @@ subroutine do_genbudget tau1j(i,j,1) = & ( ekm(i,j,1) * (u0(i+1,j,1)-u0(i,j,1)) & - -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i & + -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) & + & ( empo * ( (u0(i,jp,1)-u0(i,j,1)) *dyi & +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) & - emmo * ( (u0(i,j,1)-u0(i,jm,1)) *dyi & - +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) * dyi & + +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) * dyi * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i-1,j,2)) *dxi) & @@ -584,10 +584,10 @@ subroutine do_genbudget ( epmo * ( (v0(i+1,j,1)-v0(i,j,1)) *dxi & +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) & -emmo * ( (v0(i,j,1)-v0(i-1,j,1)) *dxi & - +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) * dxi & + +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) * dxi * anis_fac(k) & + & ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) & - -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i & + -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i,jm,2)) *dyi) & diff --git a/src/modglobal.f90 b/src/modglobal.f90 index e550f64cf..8c594f118 100644 --- a/src/modglobal.f90 +++ b/src/modglobal.f90 @@ -216,7 +216,7 @@ module modglobal real :: xsize = -1 !< domain size in x-direction real :: ysize = -1 !< domain size in y-direction real, allocatable :: delta(:) !< (dx*dy*dz)**(1/3) - real, allocatable :: deltai(:) !< (dx*dy*dz)**(-1/3) + real, allocatable :: deltai(:) !< (dx*dy*dz)**(-1/3) or dzf**-1 for anisotropic diffusion logical :: leq = .true. !< switch for (non)-equidistant mode. logical :: lmomsubs = .false. !< switch to apply subsidence on the momentum or not @@ -433,7 +433,7 @@ subroutine initglobal do k=1,k1 delta(k) = (dx*dy*dzf(k))**(1./3.) - deltai(k) = 1./delta(k) + deltai(k) = 1./delta(k) !can be overruled in modsubgrid in case anisotropic diffusion is applied end do !-------------------------------------------------- diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index a407ba5d1..a2c485718 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -37,14 +37,14 @@ module modsubgrid contains subroutine initsubgrid - use modglobal, only : ih,i1,jh,j1,k1,delta,zf,fkar,pi + use modglobal, only : ih,i1,jh,j1,k1,delta,deltai,dx,dy,zf,dzf,fkar,pi use modmpi, only : myid implicit none integer :: k - real :: ceps, ch + real :: ceps real :: mlen call subgridnamelist @@ -56,9 +56,12 @@ subroutine initsubgrid allocate(sbshr(2-ih:i1+ih,2-jh:j1+jh,1)) allocate(sbbuo(2-ih:i1+ih,2-jh:j1+jh,1)) allocate(csz(k1)) + allocate(anis_fac(k1)) ! Initialize variables to avoid problems when not using subgrid scheme JvdD + ! Determination of subgrid constants is explained in De Roode et al. 2017 ekm=0.; ekh=0.; zlt=0.; sbdiss=0.; sbshr=0.; sbbuo=0.; csz=0. + anis_fac = 0. cm = cf / (2. * pi) * (1.5*alpha_kolm)**(-1.5) @@ -83,6 +86,21 @@ subroutine initsubgrid end do end if + if(lanisotrop) then + ! Anisotropic diffusion scheme https://doi.org/10.1029/2022MS003095 + ! length scale in TKE equation is delta z (private communication with Marat) + if ((dx.ne.dy) .and. myid == 0) stop "The anisotropic diffusion assumes dx=dy." + do k = 1,k1 + deltai (k) = 1./dzf(k) !overrules deltai (k) = 1/delta(k) as defined in initglobal + anis_fac (k) = (dx/dzf(k))**2 !assumes dx=dy. is used to enhance horizontal diffusion + end do + else + do k = 1,k1 + anis_fac (k) = 1. !horizontal = vertical diffusion + end do + endif + + if (myid==0) then write (6,*) 'cf = ',cf write (6,*) 'cm = ',cm @@ -107,7 +125,7 @@ subroutine subgridnamelist integer :: ierr namelist/NAMSUBGRID/ & - ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1 + ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1,lanisotrop if(myid==0)then open(ifnamopt,file=fname_options,status='old',iostat=ierr) @@ -115,12 +133,16 @@ subroutine subgridnamelist call checknamelisterror(ierr, ifnamopt, 'NAMSUBGRID') write(6 ,NAMSUBGRID) close(ifnamopt) + + if (lmason .and. .not. ldelta) stop "lmason = .true. requires ldelta = .true." + if (lmason .and. lanisotrop) stop "lmason = .true. is not compatible with lanisotropic = .true." end if call D_MPI_BCAST(ldelta ,1, 0,comm3d,mpierr) call D_MPI_BCAST(lmason ,1, 0,comm3d,mpierr) call D_MPI_BCAST(nmason ,1, 0,comm3d,mpierr) call D_MPI_BCAST(lsmagorinsky ,1, 0,comm3d,mpierr) + call D_MPI_BCAST(lanisotrop ,1, 0,comm3d,mpierr) call D_MPI_BCAST(cs ,1, 0,comm3d,mpierr) call D_MPI_BCAST(cf ,1, 0,comm3d,mpierr) call D_MPI_BCAST(cn ,1, 0,comm3d,mpierr) @@ -128,7 +150,6 @@ subroutine subgridnamelist call D_MPI_BCAST(Prandtl ,1, 0,comm3d,mpierr) call D_MPI_BCAST(sgs_surface_fix ,1, 0,comm3d,mpierr) call D_MPI_BCAST(ch1 ,1, 0,comm3d,mpierr) - end subroutine subgridnamelist subroutine subgrid @@ -288,37 +309,79 @@ subroutine closure end do ! do TKE scheme - else - do k=1,kmax - do j=2,j1 - do i=2,i1 - if (ldelta .or. (dthvdz(i,j,k)<=0)) then - zlt(i,j,k) = delta(k) - if (lmason) zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason) - ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) - ekh(i,j,k) = (ch1 + ch2) * ekm(i,j,k) - - ekm(i,j,k) = max(ekm(i,j,k),ekmin) - ekh(i,j,k) = max(ekh(i,j,k),ekmin) - else - ! zlt(i,j,k) = min(delta(k),cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k)))) - ! faster calculation: evaluate sqrt only if the second argument is actually smaller - zlt(i,j,k) = delta(k) - if ( grav*abs(dthvdz(i,j,k)) * delta(k)**2 > (cn*e120(i,j,k))**2 * thvf(k) ) then - zlt(i,j,k) = cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k))) - end if - - if (lmason) zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason) - - ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) - ekh(i,j,k) = (ch1 + ch2 * zlt(i,j,k)*deltai(k)) * ekm(i,j,k) - - ekm(i,j,k) = max(ekm(i,j,k),ekmin) - ekh(i,j,k) = max(ekh(i,j,k),ekmin) - endif - end do - end do - end do + else + ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment + if (ldelta .and. .not. lmason) then + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = delta(k) + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = ch * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + else if (ldelta .and. lmason) then ! delta scheme with Mason length scale correction + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = delta(k) + zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason) + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = ch * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + else if (lanisotrop) then ! Anisotropic diffusion, https://doi.org/10.1029/2022MS003095 + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = dzf(k) + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = ch * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + else ! Deardorff lengthscale correction + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = delta(k) + + !original + !if (dthvdz(i,j,k) > 0) then + !zlt(i,j,k) = min(delta(k),cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k)))) + !end if + + ! alternative without if + zlt(i,j,k) = min(delta(k), & + cn*e120(i,j,k) / sqrt( grav/thvf(k) * abs(dthvdz(i,j,k))) + & + delta(k) * (1.0-sign(1.0_field_r,dthvdz(i,j,k)))) + ! the final line is 0 if dthvdz(i,j,k) > 0, else 2*delta(k) + ! ensuring that zlt(i,j,k) = delta(k) when dthvdz < 0, as + ! in the original scheme. + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = (ch1 + ch2 * zlt(i,j,k)*deltai(k)) * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + end if end if !************************************************************* @@ -530,10 +593,10 @@ subroutine diffc (a_in,a_out,flux) a_out(i,j,k) = a_out(i,j,k) & + 0.5 * ( & ( (ekh(i+1,j,k)+ekh(i,j,k))*(a_in(i+1,j,k)-a_in(i,j,k)) & - -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i & + -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i * anis_fac(k) & + & ( (ekh(i,jp,k)+ekh(i,j,k)) *(a_in(i,jp,k)-a_in(i,j,k)) & - -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i & + -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * (dzf(kp)*ekh(i,j,k) + dzf(k)*ekh(i,j,kp)) & * (a_in(i,j,kp)-a_in(i,j,k)) / dzh(kp)**2 & @@ -552,10 +615,10 @@ subroutine diffc (a_in,a_out,flux) a_out(i,j,1) = a_out(i,j,1) & + 0.5 * ( & ( (ekh(i+1,j,1)+ekh(i,j,1))*(a_in(i+1,j,1)-a_in(i,j,1)) & - -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i & + -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(k) & + & ( (ekh(i,j+1,1)+ekh(i,j,1))*(a_in(i,j+1,1)-a_in(i,j,1)) & - -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i & + -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * (dzf(2)*ekh(i,j,1) + dzf(1)*ekh(i,j,2)) & * (a_in(i,j,2)-a_in(i,j,1)) / dzh(2)**2 & @@ -591,10 +654,10 @@ subroutine diffe(a_out) a_out(i,j,k) = a_out(i,j,k) & + ( & ((ekm(i+1,j,k)+ekm(i,j,k))*(e120(i+1,j,k)-e120(i,j,k)) & - -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i & + -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i * anis_fac(k) & + & ((ekm(i,jp,k)+ekm(i,j,k)) *(e120(i,jp,k)-e120(i,j,k)) & - -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i & + -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i * anis_fac(k) & + & (rhobh(kp)/rhobf(k) * (dzf(kp)*ekm(i,j,k) + dzf(k)*ekm(i,j,kp)) & *(e120(i,j,kp)-e120(i,j,k)) / dzh(kp)**2 & @@ -615,10 +678,10 @@ subroutine diffe(a_out) a_out(i,j,1) = a_out(i,j,1) + & ( (ekm(i+1,j,1)+ekm(i,j,1))*(e120(i+1,j,1)-e120(i,j,1)) & - -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i & + -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(k) & + & ( (ekm(i,j+1,1)+ekm(i,j,1))*(e120(i,j+1,1)-e120(i,j,1)) & - -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i & + -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * (dzf(2)*ekm(i,j,1) + dzf(1)*ekm(i,j,2)) & * (e120(i,j,2)-e120(i,j,1)) / dzh(2)**2 )/dzf(1) @@ -670,12 +733,12 @@ subroutine diffu (a_out) a_out(i,j,k) = a_out(i,j,k) & + & ( ekm(i,j,k) * (u0(i+1,j,k)-u0(i,j,k)) & - -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i & + -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) & + & ( empo * ( (u0(i,jp,k)-u0(i,j,k)) *dyi & +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) & -emmo * ( (u0(i,j,k)-u0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) / dy & + +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) / dy * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k)) /dzh(kp) & +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) & @@ -723,12 +786,12 @@ subroutine diffu (a_out) a_out(i,j,1) = a_out(i,j,1) & + & ( ekm(i,j,1) * (u0(i+1,j,1)-u0(i,j,1)) & - -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i & + -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) & + & ( empo * ( (u0(i,jp,1)-u0(i,j,1)) *dyi & +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) & -emmo * ( (u0(i,j,1)-u0(i,jm,1)) *dyi & - +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy & + +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i-1,j,2)) *dxi) & @@ -783,10 +846,10 @@ subroutine diffv (a_out) ( epmo * ( (v0(i+1,j,k)-v0(i,j,k)) *dxi & +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) & -emmo * ( (v0(i,j,k)-v0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) / dx & + +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) / dx * anis_fac(k) & + & (ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) & - -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i & + -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k)) /dzh(kp) & +(w0(i,j,kp)-w0(i,jm,kp)) *dyi) & @@ -833,10 +896,10 @@ subroutine diffv (a_out) ( epmo * ( (v0(i+1,j,1)-v0(i,j,1)) *dxi & +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) & -emmo * ( (v0(i,j,1)-v0(i-1,j,1)) *dxi & - +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) / dx & + +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) / dx * anis_fac(k) & + & ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) & - -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i & + -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i,jm,2)) *dyi) & @@ -891,12 +954,12 @@ subroutine diffw(a_out) ( epom * ( (w0(i+1,j,k)-w0(i,j,k)) *dxi & +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) & -emom * ( (w0(i,j,k)-w0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))/dx & + +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))/dx * anis_fac(k) & + & ( eopm * ( (w0(i,jp,k)-w0(i,j,k)) *dyi & +(v0(i,jp,k)-v0(i,jp,km)) /dzh(k) ) & -eomm * ( (w0(i,j,k)-w0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))/dy & + +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))/dy * anis_fac(k) & + (1./rhobh(k))*& ( rhobf(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) & - rhobf(km) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. & diff --git a/src/modsubgriddata.f90 b/src/modsubgriddata.f90 index 81569c36b..8f8ad4fb2 100644 --- a/src/modsubgriddata.f90 +++ b/src/modsubgriddata.f90 @@ -35,6 +35,8 @@ module modsubgriddata logical :: ldelta = .false. !< switch for subgrid length formulation (on/off) logical :: lmason = .false. !< switch for decreased length scale near the surface logical :: lsmagorinsky = .false. !< switch for smagorinsky subgrid scheme + logical :: lanisotrop = .false. !< switch for anisotropic diffusion + real :: cf = 2.5 !< filter constant real :: Rigc = 0.25 !< critical Richardson number real :: Prandtl = (1.0/3.0) @@ -42,6 +44,7 @@ module modsubgriddata real :: cn = 0.76 real :: ch1 = 1. real :: ch2 = 2. + real :: ch ! ch = ch1 + ch2 real :: ce1 = 0.19 real :: ce2 = 0.51 real :: cs = -1 @@ -60,5 +63,6 @@ module modsubgriddata real, allocatable :: csz(:) !< Smagorinsky constant + real, allocatable :: anis_fac(:) !< grid anisotropy factor end module modsubgriddata From f23bb5a8de0300f8c77c2b876f9cc38b9491129b Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Fri, 9 Jun 2023 10:26:08 +0200 Subject: [PATCH 04/19] Clean up modsubgrid - Separated the calculations for the first level in the Smagorinsky model. This improves data parallelism. - Removed statements like km=k-1 --- src/modsubgrid.f90 | 142 +++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 70 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index ab5598061..1b55b3b5c 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -205,78 +205,80 @@ subroutine closure integer :: i,j,k,kp,km,jp,jm if(lsmagorinsky) then - do k = 1,kmax - mlen = csz(k) * delta(k) - + ! First level + mlen = csz(1) * delta(1) + do i = 2,i1 + do j = 2,j1 + strain2 = ( & + ((u0(i+1,j,1)-u0(i,j,1)) *dxi )**2 + & + ((v0(i,j+1,1)-v0(i,j,1)) *dyi )**2 + & + ((w0(i,j,2)-w0(i,j,1)) /dzf(1) )**2 ) + + strain2 = strain2 + 0.5 * ( & + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & + dudz(i,j) )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((u0(i,j+1,1)-u0(i,j,1)) *dyi + & + (v0(i,j+1,1)-v0(i-1,j+1,1)) *dxi )**2 + & + ((u0(i,j,1)-u0(i,j-1,1)) *dyi + & + (v0(i,j,1)-v0(i-1,j,1)) *dxi )**2 + & + ((u0(i+1,j,1)-u0(i+1,j-1,1)) *dyi + & + (v0(i+1,j,1)-v0(i,j,1)) *dxi )**2 + & + ((u0(i+1,j+1,1)-u0(i+1,j,1)) *dyi + & + (v0(i+1,j+1,1)-v0(i,j+1,1)) *dxi )**2 ) + + strain2 = strain2 + 0.5 * ( & + (0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & + dvdz(i,j) )**2 ) + + ekm(i,j,1) = mlen ** 2. * sqrt(2. * strain2) + ekh(i,j,1) = ekm(i,j,1) / Prandtl + + ekm(i,j,1) = max(ekm(i,j,1),ekmin) + ekh(i,j,1) = max(ekh(i,j,1),ekmin) + end do + end do + + ! Other levels + do k = 2,kmax + mlen = csz(k) * delta(k) do i = 2,i1 do j = 2,j1 - - kp=k+1 - km=k-1 - jp=j+1 - jm=j-1 - - if(k == 1) then - strain2 = ( & - ((u0(i+1,j,k)-u0(i,j,k)) *dxi )**2 + & - ((v0(i,jp,k)-v0(i,j,k)) *dyi )**2 + & - ((w0(i,j,kp)-w0(i,j,k)) /dzf(k) )**2 ) - - strain2 = strain2 + 0.5 * ( & - ( 0.25*(w0(i+1,j,kp)-w0(i-1,j,kp))*dxi + & - dudz(i,j) )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((u0(i,jp,k)-u0(i,j,k)) *dyi + & - (v0(i,jp,k)-v0(i-1,jp,k)) *dxi )**2 + & - ((u0(i,j,k)-u0(i,jm,k)) *dyi + & - (v0(i,j,k)-v0(i-1,j,k)) *dxi )**2 + & - ((u0(i+1,j,k)-u0(i+1,jm,k)) *dyi + & - (v0(i+1,j,k)-v0(i,j,k)) *dxi )**2 + & - ((u0(i+1,jp,k)-u0(i+1,j,k)) *dyi + & - (v0(i+1,jp,k)-v0(i,jp,k)) *dxi )**2 ) - - strain2 = strain2 + 0.5 * ( & - ( 0.25*(w0(i,jp,kp)-w0(i,jm,kp))*dyi + & - dvdz(i,j) )**2 ) - - else - - strain2 = ( & - ((u0(i+1,j,k)-u0(i,j,k)) *dxi )**2 + & - ((v0(i,jp,k)-v0(i,j,k)) *dyi )**2 + & - ((w0(i,j,kp)-w0(i,j,k)) /dzf(k) )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((w0(i,j,kp)-w0(i-1,j,kp)) *dxi + & - (u0(i,j,kp)-u0(i,j,k)) / dzh(kp) )**2 + & - ((w0(i,j,k)-w0(i-1,j,k)) *dxi + & - (u0(i,j,k)-u0(i,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,k)-w0(i,j,k)) *dxi + & - (u0(i+1,j,k)-u0(i+1,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,kp)-w0(i,j,kp)) *dxi + & - (u0(i+1,j,kp)-u0(i+1,j,k)) / dzh(kp) )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((u0(i,jp,k)-u0(i,j,k)) *dyi + & - (v0(i,jp,k)-v0(i-1,jp,k)) *dxi )**2 + & - ((u0(i,j,k)-u0(i,jm,k)) *dyi + & - (v0(i,j,k)-v0(i-1,j,k)) *dxi )**2 + & - ((u0(i+1,j,k)-u0(i+1,jm,k)) *dyi + & - (v0(i+1,j,k)-v0(i,j,k)) *dxi )**2 + & - ((u0(i+1,jp,k)-u0(i+1,j,k)) *dyi + & - (v0(i+1,jp,k)-v0(i,jp,k)) *dxi )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((v0(i,j,kp)-v0(i,j,k)) / dzh(kp) + & - (w0(i,j,kp)-w0(i,jm,kp)) *dyi )**2 + & - ((v0(i,j,k)-v0(i,j,km)) / dzh(k)+ & - (w0(i,j,k)-w0(i,jm,k)) *dyi )**2 + & - ((v0(i,jp,k)-v0(i,jp,km)) / dzh(k)+ & - (w0(i,jp,k)-w0(i,j,k)) *dyi )**2 + & - ((v0(i,jp,kp)-v0(i,jp,k)) / dzh(kp) + & - (w0(i,jp,kp)-w0(i,j,kp)) *dyi )**2 ) - end if + strain2 = ( & + ((u0(i+1,j,k)-u0(i,j,k)) *dxi )**2 + & + ((v0(i,j+1,k)-v0(i,j,k)) *dyi )**2 + & + ((w0(i,j,k+1)-w0(i,j,k)) /dzf(k) )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((w0(i,j,k+1)-w0(i-1,j,k+1)) *dxi + & + (u0(i,j,k+1)-u0(i,j,k)) /dzh(k+1) )**2 + & + ((w0(i,j,k)-w0(i-1,j,k)) *dxi + & + (u0(i,j,k)-u0(i,j,k-1)) /dzh(k) )**2 + & + ((w0(i+1,j,k)-w0(i,j,k)) *dxi + & + (u0(i+1,j,k)-u0(i+1,j,k-1)) /dzh(k) )**2 + & + ((w0(i+1,j,k+1)-w0(i,j,k+1)) *dxi + & + (u0(i+1,j,k+1)-u0(i+1,j,k)) /dzh(k+1) )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((u0(i,j+1,k)-u0(i,j,k)) *dyi + & + (v0(i,j+1,k)-v0(i-1,j+1,k)) *dxi )**2 + & + ((u0(i,j,k)-u0(i,j-1,k)) *dyi + & + (v0(i,j,k)-v0(i-1,j,k)) *dxi )**2 + & + ((u0(i+1,j,k)-u0(i+1,j-1,k)) *dyi + & + (v0(i+1,j,k)-v0(i,j,k)) *dxi )**2 + & + ((u0(i+1,j+1,k)-u0(i+1,j,k)) *dyi + & + (v0(i+1,j+1,k)-v0(i,j+1,k)) *dxi )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((v0(i,j,k+1)-v0(i,j,k)) /dzh(k+1) + & + (w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi )**2 + & + ((v0(i,j,k)-v0(i,j,k-1)) /dzh(k) + & + (w0(i,j,k)-w0(i,j-1,k)) *dyi )**2 + & + ((v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k) + & + (w0(i,j+1,k)-w0(i,j,k)) *dyi )**2 + & + ((v0(i,j+1,k+1)-v0(i,j+1,k)) /dzh(k+1) + & + (w0(i,j+1,k+1)-w0(i,j,k+1)) *dyi )**2 ) ekm(i,j,k) = mlen ** 2. * sqrt(2. * strain2) ekh(i,j,k) = ekm(i,j,k) / Prandtl From b246e558008e10b24fd68bb7a0d3f25a0d485d51 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Fri, 9 Jun 2023 12:13:00 +0200 Subject: [PATCH 05/19] Add spaces [skip ci] --- src/modsubgrid.f90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 1b55b3b5c..d2b034a89 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -272,13 +272,13 @@ subroutine closure strain2 = strain2 + 0.125 * ( & ((v0(i,j,k+1)-v0(i,j,k)) /dzh(k+1) + & - (w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi )**2 + & + (w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi )**2 + & ((v0(i,j,k)-v0(i,j,k-1)) /dzh(k) + & - (w0(i,j,k)-w0(i,j-1,k)) *dyi )**2 + & + (w0(i,j,k)-w0(i,j-1,k)) *dyi )**2 + & ((v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k) + & - (w0(i,j+1,k)-w0(i,j,k)) *dyi )**2 + & + (w0(i,j+1,k)-w0(i,j,k)) *dyi )**2 + & ((v0(i,j+1,k+1)-v0(i,j+1,k)) /dzh(k+1) + & - (w0(i,j+1,k+1)-w0(i,j,k+1)) *dyi )**2 ) + (w0(i,j+1,k+1)-w0(i,j,k+1)) *dyi )**2 ) ekm(i,j,k) = mlen ** 2. * sqrt(2. * strain2) ekh(i,j,k) = ekm(i,j,k) / Prandtl From 457d34aa59d3a1e0e43a9a172692657064ee1428 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Sat, 10 Jun 2023 18:29:51 +0200 Subject: [PATCH 06/19] Remove unused variables --- src/modsubgrid.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 1f25b2423..aed86c371 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -223,7 +223,7 @@ subroutine closure implicit none real :: strain2,mlen - integer :: i,j,k,kp,km,jp,jm + integer :: i,j,k if(lsmagorinsky) then ! First level From f46fa37d441b16fe6e478fd4def7f78125c45880 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Sun, 11 Jun 2023 20:28:18 +0200 Subject: [PATCH 07/19] Temporary fix for halo transfers --- src/modmpi.f90 | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/modmpi.f90 b/src/modmpi.f90 index 5564689d8..079281cb5 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -35,6 +35,9 @@ module modmpi use modmpiinterface +#ifdef _OPENACC +use openacc +#endif implicit none save type(MPI_COMM) :: commwrld, comm3d, commrow, commcol @@ -344,8 +347,14 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) , sends,recvs & , sende,recve & , sendw,recvw + ! Check if the data is on the gpu + #ifdef _OPENACC + integer :: is_present + is_present = acc_is_present(a) + #endif ! Calulate buffer lengths + !$acc kernels if(is_present) async xl = size(a,1) yl = size(a,2) zl = size(a,3) @@ -382,8 +391,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:) a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:) + !$acc end kernels endif @@ -414,9 +425,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:) a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:) - + !$acc end kernels endif if(nprocy.gt.1)then @@ -456,11 +468,18 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) , sends,recvs & , sende,recve & , sendw,recvw + ! Check if the data is on the gpu + #ifdef _OPENACC + integer :: is_present + is_present = acc_is_present(a) + #endif ! Calulate buffer lengths + !$acc kernels default(present) if(is_present) async(1) xl = size(a,1) yl = size(a,2) zl = size(a,3) + !$acc end kernels ! Calculate buffer size nssize = xl*jh*zl @@ -495,8 +514,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) if(is_present) async(1) a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:) a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:) + !$acc end kernels endif @@ -527,8 +548,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) if(is_present) async(1) a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:) a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:) + !$acc end kernels endif From edd82bc585be3b30c8e3b4b709f0a671d0619db4 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Mon, 12 Jun 2023 17:03:55 +0200 Subject: [PATCH 08/19] Add temporary timers --- src/modsubgrid.f90 | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index aed86c371..9e58d9c24 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -31,6 +31,7 @@ module modsubgrid use modsubgriddata use modprecision, only: field_r +use modtimer, only: timer_tic, timer_toc implicit none save public :: subgrid, initsubgrid, exitsubgrid, subgridnamelist @@ -162,8 +163,10 @@ subroutine subgrid use modsurfdata,only : thlflux,qtflux,svflux implicit none integer n - + + call timer_tic("closure", 0) call closure + call timer_toc("closure") call diffu(up) call diffv(vp) call diffw(wp) @@ -226,6 +229,7 @@ subroutine closure integer :: i,j,k if(lsmagorinsky) then + call timer_tic("Smagorinsky", 1) ! First level mlen = csz(1) * delta(1) do i = 2,i1 @@ -309,7 +313,7 @@ subroutine closure end do end do end do - + call timer_toc("Smagorinsky") ! do TKE scheme else ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment @@ -385,20 +389,22 @@ subroutine closure end do end if end if - !************************************************************* ! Set cyclic boundary condition for K-closure factors. !************************************************************* - + call timer_tic("Boundary conditions", 2) call excjs( ekm , 2,i1,2,j1,1,k1,ih,jh) call excjs( ekh , 2,i1,2,j1,1,k1,ih,jh) + call timer_toc("Boundary conditions") + call timer_tic("Write buffer", 3) do j=1,j2 do i=1,i2 ekm(i,j,k1) = ekm(i,j,kmax) ekh(i,j,k1) = ekh(i,j,kmax) end do end do + call timer_toc("Write buffer") return end subroutine closure From c9ac6194241a27bec24501782f7530030ad3fbad Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Mon, 12 Jun 2023 20:46:30 +0200 Subject: [PATCH 09/19] Close kernel region --- src/modmpi.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/src/modmpi.f90 b/src/modmpi.f90 index 079281cb5..eca1166d5 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -358,6 +358,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) xl = size(a,1) yl = size(a,2) zl = size(a,3) + !$acc end kernels ! Calculate buffer size nssize = xl*jh*zl From 822cb9991c6bb2dfe9e9a0992fdbed1217dc7703 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 09:25:15 +0200 Subject: [PATCH 10/19] Clean up subgrid --- src/modsubgrid.f90 | 491 +++++++++++++++++++++++++-------------------- 1 file changed, 270 insertions(+), 221 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 9e58d9c24..334541b5e 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -442,56 +442,49 @@ subroutine sources real tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv integer i,j,k,jm,jp,km,kp - do k=2,kmax - do j=2,j1 - do i=2,i1 - kp=k+1 - km=k-1 - jp=j+1 - jm=j-1 - - tdef2 = 2. * ( & - ((u0(i+1,j,k)-u0(i,j,k)) /dx )**2 + & - ((v0(i,jp,k)-v0(i,j,k)) /dy )**2 + & - ((w0(i,j,kp)-w0(i,j,k)) /dzf(k) )**2 ) - - tdef2 = tdef2 + 0.25 * ( & - ((w0(i,j,kp)-w0(i-1,j,kp)) / dx + & - (u0(i,j,kp)-u0(i,j,k)) / dzh(kp) )**2 + & - ((w0(i,j,k)-w0(i-1,j,k)) / dx + & - (u0(i,j,k)-u0(i,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,k)-w0(i,j,k)) / dx + & - (u0(i+1,j,k)-u0(i+1,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,kp)-w0(i,j,kp)) / dx + & - (u0(i+1,j,kp)-u0(i+1,j,k)) / dzh(kp) )**2 ) - - tdef2 = tdef2 + 0.25 * ( & - ((u0(i,jp,k)-u0(i,j,k)) / dy + & - (v0(i,jp,k)-v0(i-1,jp,k)) / dx )**2 + & - ((u0(i,j,k)-u0(i,jm,k)) / dy + & - (v0(i,j,k)-v0(i-1,j,k)) / dx )**2 + & - ((u0(i+1,j,k)-u0(i+1,jm,k)) / dy + & - (v0(i+1,j,k)-v0(i,j,k)) / dx )**2 + & - ((u0(i+1,jp,k)-u0(i+1,j,k)) / dy + & - (v0(i+1,jp,k)-v0(i,jp,k)) / dx )**2 ) - - tdef2 = tdef2 + 0.25 * ( & - ((v0(i,j,kp)-v0(i,j,k)) / dzh(kp) + & - (w0(i,j,kp)-w0(i,jm,kp)) / dy )**2 + & - ((v0(i,j,k)-v0(i,j,km)) / dzh(k)+ & - (w0(i,j,k)-w0(i,jm,k)) / dy )**2 + & - ((v0(i,jp,k)-v0(i,jp,km)) / dzh(k)+ & - (w0(i,jp,k)-w0(i,j,k)) / dy )**2 + & - ((v0(i,jp,kp)-v0(i,jp,k)) / dzh(kp) + & - (w0(i,jp,kp)-w0(i,j,kp)) / dy )**2 ) - - e12p(i,j,k) = e12p(i,j,k) & - + (ekm(i,j,k)*tdef2 - ekh(i,j,k)*grav/thvf(k)*dthvdz(i,j,k) ) / (2*e120(i,j,k)) & ! sbshr and sbbuo - - (ce1 + ce2*zlt(i,j,k)*deltai(k)) * e120(i,j,k)**2 /(2.*zlt(i,j,k)) ! sbdiss - - end do - end do + do j=2,j1 + do i=2,i1 + tdef2 = 2. * ( & + ((u0(i+1,j,k)-u0(i,j,k)) /dx )**2 + & + ((v0(i,j+1,k)-v0(i,j,k)) /dy )**2 + & + ((w0(i,j,k+1)-w0(i,j,k)) /dzf(k) )**2 ) + + tdef2 = tdef2 + 0.25 * ( & + ((w0(i,j,k+1)-w0(i-1,j,k+1)) / dx + & + (u0(i,j,k+1)-u0(i,j,k)) / dzh(k+1) )**2 + & + ((w0(i,j,k)-w0(i-1,j,k)) / dx + & + (u0(i,j,k)-u0(i,j,k-1)) / dzh(k) )**2 + & + ((w0(i+1,j,k)-w0(i,j,k)) / dx + & + (u0(i+1,j,k)-u0(i+1,j,k-1)) / dzh(k) )**2 + & + ((w0(i+1,j,k+1)-w0(i,j,k+1)) / dx + & + (u0(i+1,j,k+1)-u0(i+1,j,k)) / dzh(k+1) )**2 ) + + tdef2 = tdef2 + 0.25 * ( & + ((u0(i,j+1,k)-u0(i,j,k)) / dy + & + (v0(i,j+1,k)-v0(i-1,j+1,k)) / dx )**2 + & + ((u0(i,j,k)-u0(i,j-1,k)) / dy + & + (v0(i,j,k)-v0(i-1,j,k)) / dx )**2 + & + ((u0(i+1,j,k)-u0(i+1,j-1,k)) / dy + & + (v0(i+1,j,k)-v0(i,j,k)) / dx )**2 + & + ((u0(i+1,j+1,k)-u0(i+1,j,k)) / dy + & + (v0(i+1,j+1,k)-v0(i,j+1,k)) / dx )**2 ) + + tdef2 = tdef2 + 0.25 * ( & + ((v0(i,j,k+1)-v0(i,j,k)) / dzh(k+1) + & + (w0(i,j,k+1)-w0(i,j-1,k+1)) / dy )**2 + & + ((v0(i,j,k)-v0(i,j,k-1)) / dzh(k) + & + (w0(i,j,k)-w0(i,j-1,k)) / dy )**2 + & + ((v0(i,j+1,k)-v0(i,j+1,k-1)) / dzh(k) + & + (w0(i,j+1,k)-w0(i,j,k)) / dy )**2 + & + ((v0(i,j+1,k+1)-v0(i,j+1,k)) / dzh(k+1) + & + (w0(i,j+1,k+1)-w0(i,j,k+1)) / dy )**2 ) + + e12p(i,j,k) = e12p(i,j,k) & + + (ekm(i,j,k)*tdef2 - ekh(i,j,k)*grav/thvf(k)*dthvdz(i,j,k) ) / (2*e120(i,j,k)) & ! sbshr and sbbuo + - (ce1 + ce2*zlt(i,j,k)*deltai(k)) * e120(i,j,k)**2 /(2.*zlt(i,j,k)) ! sbdiss + end do + end do end do ! ----------------------------------------------end i,j,k-loop @@ -499,77 +492,148 @@ subroutine sources ! special treatment for lowest full level: k=1 ! -------------------------------------------- - - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - - - -! ** Calculate "shear" production term: tdef2 **************** - - tdef2 = 2. * ( & - ((u0(i+1,j,1)-u0(i,j,1))*dxi)**2 & - + ((v0(i,jp,1)-v0(i,j,1))*dyi)**2 & - + ((w0(i,j,2)-w0(i,j,1))/dzf(1))**2 ) - - if (sgs_surface_fix) then - ! Use known surface flux and exchange coefficient to derive - ! consistent gradient (such that correct flux will occur in - ! shear production term) - ! Make sure that no division by zero occurs in determination of the - ! directional component; ekm should already be >= ekmin - ! Replace the dudz by surface flux -uw / ekm - horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) - uwflux = -ustar(i,j)*ustar(i,j)* ((u0(i,j,1)+cu)/horv) - local_dudz = -uwflux / ekm(i,j,1) - tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & - local_dudz )**2 - else - tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & - dudz(i,j) )**2 - endif - - tdef2 = tdef2 + 0.25 *( & - ((u0(i,jp,1)-u0(i,j,1))*dyi+(v0(i,jp,1)-v0(i-1,jp,1))*dxi)**2 & - +((u0(i,j,1)-u0(i,jm,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & - +((u0(i+1,j,1)-u0(i+1,jm,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & - +((u0(i+1,jp,1)-u0(i+1,j,1))*dyi+ & - (v0(i+1,jp,1)-v0(i,jp,1))*dxi)**2 ) - - if (sgs_surface_fix) then - ! Use known surface flux and exchange coefficient to derive - ! consistent gradient (such that correct flux will occur in - ! shear production term) - ! Make sure that no division by zero occurs in determination of the - ! directional component; ekm should already be >= ekmin - ! Replace the dvdz by surface flux -vw / ekm - horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) - vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv) - local_dvdz = -vwflux / ekm(i,j,1) - tdef2 = tdef2 + ( 0.25*(w0(i,jp,2)-w0(i,jm,2))*dyi + & - local_dvdz )**2 - else - tdef2 = tdef2 + ( 0.25*(w0(i,jp,2)-w0(i,jm,2))*dyi + & + if (sgs_surface_fix) then + do j=2,j1 + do i=2,i1 + tdef2 = 2. * ( & + ((u0(i+1,j,1)-u0(i,j,1)) *dxi )**2 & + + ((v0(i,j+1,1)-v0(i,j,1)) *dyi )**2 & + + ((w0(i,j,2)-w0(i,j,1)) /dzf(1) )**2 ) + ! Use known surface flux and exchange coefficient to derive + ! consistent gradient (such that correct flux will occur in + ! shear production term) + ! Make sure that no division by zero occurs in determination of the + ! directional component; ekm should already be >= ekmin + ! Replace the dudz by surface flux -uw / ekm + horv = max(sqrt((u0(i,j,1)+cu)**2 + (v0(i,j,1)+cv)**2), 0.01) + uwflux = -ustar(i,j)*ustar(i,j) * ((u0(i,j,1)+cu)/horv) + local_dudz = -uwflux / ekm(i,j,1) + + tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & + local_dudz )**2 + + tdef2 = tdef2 + 0.25 *( & + ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 & + +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & + +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & + +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi & + +(v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2 ) + + ! Use known surface flux and exchange coefficient to derive + ! consistent gradient (such that correct flux will occur in + ! shear production term) + ! Make sure that no division by zero occurs in determination of the + ! directional component; ekm should already be >= ekmin + ! Replace the dvdz by surface flux -vw / ekm + vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv) + local_dvdz = -vwflux / ekm(i,j,1) + tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & + local_dvdz )**2 + + ! ** Include shear and buoyancy production terms and dissipation ** + sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) + + ! Replace the -ekh * dthvdz by the surface flux of thv + ! (but we only have the thlflux , which seems at the surface to be + ! equivalent + local_dthvdz = -thlflux(i,j)/ekh(i,j,1) + sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1)) + sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) + end do + end do + else + do j=2,j1 + do i=2,i1 + tdef2 = 2. * ( & + ((u0(i+1,j,1)-u0(i,j,1)) *dxi )**2 & + + ((v0(i,j+1,1)-v0(i,j,1)) *dyi )**2 & + + ((w0(i,j,2)-w0(i,j,1)) /dzf(1) )**2 ) + + tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & + dudz(i,j) )**2 + + tdef2 = tdef2 + 0.25 *( & + ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 & + +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & + +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & + +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi & + +(v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2 ) + + tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & dvdz(i,j) )**2 - endif -! ** Include shear and buoyancy production terms and dissipation ** + sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) + sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1)) + sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) + end do + end do + endif - sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) - if (sgs_surface_fix) then - ! Replace the -ekh * dthvdz by the surface flux of thv - ! (but we only have the thlflux , which seems at the surface to be - ! equivalent - local_dthvdz = -thlflux(i,j)/ekh(i,j,1) - sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1)) - else - sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1)) - endif - sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) - end do - end do + +! do j=2,j1 +! do i=2,i1 +! ! ** Calculate "shear" production term: tdef2 **************** +! tdef2 = 2. * ( & +! ((u0(i+1,j,1)-u0(i,j,1))*dxi)**2 & +! + ((v0(i,j+1,1)-v0(i,j,1))*dyi)**2 & +! + ((w0(i,j,2)-w0(i,j,1))/dzf(1))**2 ) +! +! if (sgs_surface_fix) then +! ! Use known surface flux and exchange coefficient to derive +! ! consistent gradient (such that correct flux will occur in +! ! shear production term) +! ! Make sure that no division by zero occurs in determination of the +! ! directional component; ekm should already be >= ekmin +! ! Replace the dudz by surface flux -uw / ekm +! horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) +! uwflux = -ustar(i,j)*ustar(i,j)* ((u0(i,j,1)+cu)/horv) +! local_dudz = -uwflux / ekm(i,j,1) +! tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & +! local_dudz )**2 +! else +! tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & +! dudz(i,j) )**2 +! endif +! +! tdef2 = tdef2 + 0.25 *( & +! ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 & +! +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & +! +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & +! +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi+ & +! (v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2 ) +! +! if (sgs_surface_fix) then +! ! Use known surface flux and exchange coefficient to derive +! ! consistent gradient (such that correct flux will occur in +! ! shear production term) +! ! Make sure that no division by zero occurs in determination of the +! ! directional component; ekm should already be >= ekmin +! ! Replace the dvdz by surface flux -vw / ekm +! horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) +! vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv) +! local_dvdz = -vwflux / ekm(i,j,1) +! tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & +! local_dvdz )**2 +! else +! tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & +! dvdz(i,j) )**2 +! endif +! +!! ** Include shear and buoyancy production terms and dissipation ** +! +! sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) +! if (sgs_surface_fix) then +! ! Replace the -ekh * dthvdz by the surface flux of thv +! ! (but we only have the thlflux , which seems at the surface to be +! ! equivalent +! local_dthvdz = -thlflux(i,j)/ekh(i,j,1) +! sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1)) +! else +! sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1)) +! endif +! sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) +! end do +! end do e12p(2:i1,2:j1,1) = e12p(2:i1,2:j1,1) + & sbshr(2:i1,2:j1,1)+sbbuo(2:i1,2:j1,1)+sbdiss(2:i1,2:j1,1) @@ -603,14 +667,14 @@ subroutine diffc (a_in,a_out,flux) ( (ekh(i+1,j,k)+ekh(i,j,k))*(a_in(i+1,j,k)-a_in(i,j,k)) & -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i * anis_fac(k) & + & - ( (ekh(i,jp,k)+ekh(i,j,k)) *(a_in(i,jp,k)-a_in(i,j,k)) & - -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i * anis_fac(k) & + ( (ekh(i,j+1,k)+ekh(i,j,k)) *(a_in(i,j+1,k)-a_in(i,j,k)) & + -(ekh(i,j,k)+ekh(i,j-1,k)) *(a_in(i,j,k)-a_in(i,j-1,k)) )*dy2i * anis_fac(k) & + & - ( rhobh(kp)/rhobf(k) * (dzf(kp)*ekh(i,j,k) + dzf(k)*ekh(i,j,kp)) & - * (a_in(i,j,kp)-a_in(i,j,k)) / dzh(kp)**2 & + ( rhobh(k+1)/rhobf(k) * (dzf(k+1)*ekh(i,j,k) + dzf(k)*ekh(i,j,k+1)) & + * (a_in(i,j,k+1)-a_in(i,j,k)) / dzh(k+1)**2 & - & - rhobh(k)/rhobf(k) * (dzf(km)*ekh(i,j,k) + dzf(k)*ekh(i,j,km)) & - * (a_in(i,j,k)-a_in(i,j,km)) / dzh(k)**2 )/dzf(k) & + rhobh(k)/rhobf(k) * (dzf(k-1)*ekh(i,j,k) + dzf(k)*ekh(i,j,k-1)) & + * (a_in(i,j,k)-a_in(i,j,k-1)) / dzh(k)**2 )/dzf(k) & ) end do @@ -664,13 +728,13 @@ subroutine diffe(a_out) ((ekm(i+1,j,k)+ekm(i,j,k))*(e120(i+1,j,k)-e120(i,j,k)) & -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i * anis_fac(k) & + & - ((ekm(i,jp,k)+ekm(i,j,k)) *(e120(i,jp,k)-e120(i,j,k)) & - -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i * anis_fac(k) & + ((ekm(i,j+1,k)+ekm(i,j,k)) *(e120(i,j+1,k)-e120(i,j,k)) & + -(ekm(i,j,k)+ekm(i,j-1,k)) *(e120(i,j,k)-e120(i,j-1,k)) )*dy2i * anis_fac(k) & + & - (rhobh(kp)/rhobf(k) * (dzf(kp)*ekm(i,j,k) + dzf(k)*ekm(i,j,kp)) & - *(e120(i,j,kp)-e120(i,j,k)) / dzh(kp)**2 & - - rhobh(k)/rhobf(k) * (dzf(km)*ekm(i,j,k) + dzf(k)*ekm(i,j,km)) & - *(e120(i,j,k)-e120(i,j,km)) / dzh(k)**2 )/dzf(k) & + (rhobh(k+1)/rhobf(k) * (dzf(k+1)*ekm(i,j,k) + dzf(k)*ekm(i,j,k+1)) & + *(e120(i,j,k+1)-e120(i,j,k)) / dzh(k+1)**2 & + - rhobh(k)/rhobf(k) * (dzf(k-1)*ekm(i,j,k) + dzf(k)*ekm(i,j,k-1)) & + *(e120(i,j,k)-e120(i,j,k-1)) / dzh(k)**2 )/dzf(k) & ) end do @@ -714,28 +778,22 @@ subroutine diffu (a_out) integer :: i,j,k,jm,jp,km,kp do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - emom = ( dzf(km) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i-1,j,km) ) ) / & + emom = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i-1,j,k-1) ) ) / & ( 4. * dzh(k) ) - emop = ( dzf(kp) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & - dzf(k) * ( ekm(i,j,kp) + ekm(i-1,j,kp) ) ) / & - ( 4. * dzh(kp) ) + emop = ( dzf(k+1) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & + dzf(k) * ( ekm(i,j,k+1) + ekm(i-1,j,k+1) ) ) / & + ( 4. * dzh(k+1) ) empo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jp,k)+ekm(i-1,jp,k)+ekm(i-1,j,k) ) + ekm(i,j,k)+ekm(i,j+1,k)+ekm(i-1,j+1,k)+ekm(i-1,j,k) ) emmo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jm,k)+ekm(i-1,jm,k)+ekm(i-1,j,k) ) + ekm(i,j,k)+ekm(i,j-1,k)+ekm(i-1,j-1,k)+ekm(i-1,j,k) ) a_out(i,j,k) = a_out(i,j,k) & @@ -743,14 +801,14 @@ subroutine diffu (a_out) ( ekm(i,j,k) * (u0(i+1,j,k)-u0(i,j,k)) & -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) & + & - ( empo * ( (u0(i,jp,k)-u0(i,j,k)) *dyi & - +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) & - -emmo * ( (u0(i,j,k)-u0(i,jm,k)) *dyi & + ( empo * ( (u0(i,j+1,k)-u0(i,j,k)) *dyi & + +(v0(i,j+1,k)-v0(i-1,j+1,k))*dxi) & + -emmo * ( (u0(i,j,k)-u0(i,j-1,k)) *dyi & +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) / dy * anis_fac(k) & + & - ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k)) /dzh(kp) & - +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) & - - rhobh(k)/rhobf(k) * emom * ( (u0(i,j,k)-u0(i,j,km)) /dzh(k) & + ( rhobh(k+1)/rhobf(k) * emop * ( (u0(i,j,k+1)-u0(i,j,k)) /dzh(k+1) & + +(w0(i,j,k+1)-w0(i-1,j,k+1))*dxi) & + - rhobh(k)/rhobf(k) * emom * ( (u0(i,j,k)-u0(i,j,k-1)) /dzh(k) & +(w0(i,j,k)-w0(i-1,j,k)) *dxi) ) /dzf(k) end do @@ -762,16 +820,13 @@ subroutine diffu (a_out) ! -------------------------------------------- do j=2,j1 - jp = j+1 - jm = j-1 - do i=2,i1 empo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jp,1)+ekm(i-1,jp,1)+ekm(i-1,j,1) ) + ekm(i,j,1)+ekm(i,j+1,1)+ekm(i-1,j+1,1)+ekm(i-1,j,1) ) emmo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jm,1)+ekm(i-1,jm,1)+ekm(i-1,j,1) ) + ekm(i,j,1)+ekm(i,j-1,1)+ekm(i-1,j-1,1)+ekm(i-1,j,1) ) emop = ( dzf(2) * ( ekm(i,j,1) + ekm(i-1,j,1) ) + & dzf(1) * ( ekm(i,j,2) + ekm(i-1,j,2) ) ) / & @@ -780,25 +835,28 @@ subroutine diffu (a_out) ucu = 0.5*(u0(i,j,1)+u0(i+1,j,1))+cu - if(ucu >= 0.) then - upcu = max(ucu,1.e-10) - else - upcu = min(ucu,-1.e-10) - end if - + !if(ucu >= 0.) then + ! upcu = max(ucu,1.e-10) + !else + ! upcu = min(ucu,-1.e-10) + !end if + + ! Branchless version of the algorithm above, + ! may or may not be more efficient + upcu = sign(1.,ucu) * max(abs(ucu),1.e-10) fu = ( 0.5*( ustar(i,j)+ustar(i-1,j) ) )**2 * & upcu/sqrt(upcu**2 + & - ((v0(i,j,1)+v0(i-1,j,1)+v0(i,jp,1)+v0(i-1,jp,1))/4.+cv)**2) + ((v0(i,j,1)+v0(i-1,j,1)+v0(i,j+1,1)+v0(i-1,j+1,1))/4.+cv)**2) a_out(i,j,1) = a_out(i,j,1) & + & ( ekm(i,j,1) * (u0(i+1,j,1)-u0(i,j,1)) & -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) & + & - ( empo * ( (u0(i,jp,1)-u0(i,j,1)) *dyi & - +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) & - -emmo * ( (u0(i,j,1)-u0(i,jm,1)) *dyi & + ( empo * ( (u0(i,j+1,1)-u0(i,j,1)) *dyi & + +(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi) & + -emmo * ( (u0(i,j,1)-u0(i,j-1,1)) *dyi & +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1)) /dzh(2) & @@ -825,44 +883,38 @@ subroutine diffv (a_out) integer :: i,j,k,jm,jp,km,kp do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - eomm = ( dzf(km) * ( ekm(i,j,k) + ekm(i,jm,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i,jm,km) ) ) / & + eomm = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i,j-1,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i,j-1,k-1) ) ) / & ( 4. * dzh(k) ) - eomp = ( dzf(kp) * ( ekm(i,j,k) + ekm(i,jm,k) ) + & - dzf(k) * ( ekm(i,j,kp) + ekm(i,jm,kp) ) ) / & - ( 4. * dzh(kp) ) + eomp = ( dzf(k+1) * ( ekm(i,j,k) + ekm(i,j-1,k) ) + & + dzf(k) * ( ekm(i,j,k+1) + ekm(i,j-1,k+1) ) ) / & + ( 4. * dzh(k+1) ) emmo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jm,k)+ekm(i-1,jm,k)+ekm(i-1,j,k) ) + ekm(i,j,k)+ekm(i,j-1,k)+ekm(i-1,j-1,k)+ekm(i-1,j,k) ) epmo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jm,k)+ekm(i+1,jm,k)+ekm(i+1,j,k) ) + ekm(i,j,k)+ekm(i,j-1,k)+ekm(i+1,j-1,k)+ekm(i+1,j,k) ) a_out(i,j,k) = a_out(i,j,k) & + & ( epmo * ( (v0(i+1,j,k)-v0(i,j,k)) *dxi & - +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) & + +(u0(i+1,j,k)-u0(i+1,j-1,k))*dyi) & -emmo * ( (v0(i,j,k)-v0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) / dx * anis_fac(k) & + +(u0(i,j,k)-u0(i,j-1,k)) *dyi) ) / dx * anis_fac(k) & + & - (ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) & - -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i * anis_fac(k) & + (ekm(i,j,k) * (v0(i,j+1,k)-v0(i,j,k)) & + -ekm(i,j-1,k)* (v0(i,j,k)-v0(i,j-1,k)) ) * 2. * dy2i * anis_fac(k) & + & - ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k)) /dzh(kp) & - +(w0(i,j,kp)-w0(i,jm,kp)) *dyi) & - - rhobh(k)/rhobf(k) * eomm * ( (v0(i,j,k)-v0(i,j,km)) /dzh(k) & - +(w0(i,j,k)-w0(i,jm,k)) *dyi) ) / dzf(k) + ( rhobh(k+1)/rhobf(k) * eomp * ( (v0(i,j,k+1)-v0(i,j,k)) /dzh(k+1) & + +(w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi) & + - rhobh(k)/rhobf(k) * eomm * ( (v0(i,j,k)-v0(i,j,k-1)) /dzh(k) & + +(w0(i,j,k)-w0(i,j-1,k)) *dyi) ) / dzf(k) end do end do @@ -873,44 +925,45 @@ subroutine diffv (a_out) ! -------------------------------------------- do j=2,j1 - jp = j+1 - jm = j-1 do i=2,i1 emmo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jm,1)+ekm(i-1,jm,1)+ekm(i-1,j,1) ) + ekm(i,j,1)+ekm(i,j-1,1)+ekm(i-1,j-1,1)+ekm(i-1,j,1) ) epmo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jm,1)+ekm(i+1,jm,1)+ekm(i+1,j,1) ) + ekm(i,j,1)+ekm(i,j-1,1)+ekm(i+1,j-1,1)+ekm(i+1,j,1) ) - eomp = ( dzf(2) * ( ekm(i,j,1) + ekm(i,jm,1) ) + & - dzf(1) * ( ekm(i,j,2) + ekm(i,jm,2) ) ) / & + eomp = ( dzf(2) * ( ekm(i,j,1) + ekm(i,j-1,1) ) + & + dzf(1) * ( ekm(i,j,2) + ekm(i,j-1,2) ) ) / & ( 4. * dzh(2) ) vcv = 0.5*(v0(i,j,1)+v0(i,j+1,1))+cv - if(vcv >= 0.) then - vpcv = max(vcv,1.e-10) - else - vpcv = min(vcv,-1.e-10) - end if + + !if(vcv >= 0.) then + ! vpcv = max(vcv,1.e-10) + !else + ! vpcv = min(vcv,-1.e-10) + !end if + + vpcv = sign(1., vcv) * max(abs(vcv),1.e-10) fv = ( 0.5*( ustar(i,j)+ustar(i,j-1) ) )**2 * & vpcv/sqrt(vpcv**2 + & - ((u0(i,j,1)+u0(i+1,j,1)+u0(i,jm,1)+u0(i+1,jm,1))/4.+cu)**2) + ((u0(i,j,1)+u0(i+1,j,1)+u0(i,j-1,1)+u0(i+1,j-1,1))/4.+cu)**2) a_out(i,j,1) = a_out(i,j,1) & + & ( epmo * ( (v0(i+1,j,1)-v0(i,j,1)) *dxi & - +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) & + +(u0(i+1,j,1)-u0(i+1,j-1,1))*dyi) & -emmo * ( (v0(i,j,1)-v0(i-1,j,1)) *dxi & - +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) / dx * anis_fac(k) & + +(u0(i,j,1)-u0(i,j-1,1)) *dyi) ) / dx * anis_fac(k) & + & - ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) & - -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i * anis_fac(k) & + ( ekm(i,j,1) * (v0(i,j+1,1)-v0(i,j,1)) & + -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1)) /dzh(2) & - +(w0(i,j,2)-w0(i,jm,2)) *dyi) & + +(w0(i,j,2)-w0(i,j-1,2)) *dyi) & -rhobh(1)/rhobf(1)*fv ) / dzf(1) end do @@ -933,44 +986,40 @@ subroutine diffw(a_out) integer :: i,j,k,jm,jp,km,kp do k=2,kmax - kp=k+1 - km=k-1 do j=2,j1 - jp=j+1 - jm=j-1 do i=2,i1 - emom = ( dzf(km) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i-1,j,km) ) ) / & + emom = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i-1,j,k-1) ) ) / & ( 4. * dzh(k) ) - eomm = ( dzf(km) * ( ekm(i,j,k) + ekm(i,jm,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i,jm,km) ) ) / & + eomm = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i,j-1,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i,j-1,k-1) ) ) / & ( 4. * dzh(k) ) - eopm = ( dzf(km) * ( ekm(i,j,k) + ekm(i,jp,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i,jp,km) ) ) / & + eopm = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i,j+1,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i,j+1,k-1) ) ) / & ( 4. * dzh(k) ) - epom = ( dzf(km) * ( ekm(i,j,k) + ekm(i+1,j,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i+1,j,km) ) ) / & + epom = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i+1,j,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i+1,j,k-1) ) ) / & ( 4. * dzh(k) ) a_out(i,j,k) = a_out(i,j,k) & + & ( epom * ( (w0(i+1,j,k)-w0(i,j,k)) *dxi & - +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) & + +(u0(i+1,j,k)-u0(i+1,j,k-1)) /dzh(k) ) & -emom * ( (w0(i,j,k)-w0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))/dx * anis_fac(k) & + +(u0(i,j,k)-u0(i,j,k-1)) /dzh(k) ))/dx * anis_fac(k) & + & - ( eopm * ( (w0(i,jp,k)-w0(i,j,k)) *dyi & - +(v0(i,jp,k)-v0(i,jp,km)) /dzh(k) ) & - -eomm * ( (w0(i,j,k)-w0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))/dy * anis_fac(k) & + ( eopm * ( (w0(i,j+1,k)-w0(i,j,k)) *dyi & + +(v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k) ) & + -eomm * ( (w0(i,j,k)-w0(i,j-1,k)) *dyi & + +(v0(i,j,k)-v0(i,j,k-1)) /dzh(k) ))/dy * anis_fac(k) & + (1./rhobh(k))*& - ( rhobf(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) & - - rhobf(km) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. & + ( rhobf(k) * ekm(i,j,k) * (w0(i,j,k+1)-w0(i,j,k)) /dzf(k) & + - rhobf(k-1) * ekm(i,j,k-1)* (w0(i,j,k)-w0(i,j,k-1)) /dzf(k-1) ) * 2. & / dzh(k) end do From 135334fbffb7ecfd34090495ac0a8f9258f51421 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 09:30:54 +0200 Subject: [PATCH 11/19] Change directives --- src/modmpi.f90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/modmpi.f90 b/src/modmpi.f90 index eca1166d5..fe6a1bc09 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -35,7 +35,7 @@ module modmpi use modmpiinterface -#ifdef _OPENACC +#if defined(_OPENACC) use openacc #endif implicit none @@ -348,7 +348,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) , sende,recve & , sendw,recvw ! Check if the data is on the gpu - #ifdef _OPENACC + #if defined(_OPENACC) integer :: is_present is_present = acc_is_present(a) #endif @@ -470,7 +470,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) , sende,recve & , sendw,recvw ! Check if the data is on the gpu - #ifdef _OPENACC + #if defined(_OPENACC) integer :: is_present is_present = acc_is_present(a) #endif From d8bf150c898fbb2d71770d4d76f1b83938881799 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 10:01:32 +0200 Subject: [PATCH 12/19] Remove unused loop indices --- src/modsubgrid.f90 | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 334541b5e..254388bc0 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -440,7 +440,7 @@ subroutine sources implicit none real tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv - integer i,j,k,jm,jp,km,kp + integer i,j,k do k=2,kmax do j=2,j1 @@ -651,16 +651,10 @@ subroutine diffc (a_in,a_out,flux) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real, intent(in) :: flux (i2,j2) - integer i,j,k,jm,jp,km,kp - + integer i,j,k + do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 a_out(i,j,k) = a_out(i,j,k) & + 0.5 * ( & @@ -711,16 +705,9 @@ subroutine diffe(a_out) implicit none real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) - integer :: i,j,k,jm,jp,km,kp - + integer :: i,j,k do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 a_out(i,j,k) = a_out(i,j,k) & @@ -775,7 +762,7 @@ subroutine diffu (a_out) real :: emmo,emom,emop,empo real :: fu real :: ucu, upcu - integer :: i,j,k,jm,jp,km,kp + integer :: i,j,k do k=2,kmax do j=2,j1 @@ -880,7 +867,7 @@ subroutine diffv (a_out) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real :: emmo, eomm,eomp,epmo real :: fv, vcv,vpcv - integer :: i,j,k,jm,jp,km,kp + integer :: i,j,k do k=2,kmax do j=2,j1 @@ -983,8 +970,7 @@ subroutine diffw(a_out) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real :: emom, eomm, eopm, epom - integer :: i,j,k,jm,jp,km,kp - + integer :: i,j,k do k=2,kmax do j=2,j1 do i=2,i1 From dd5a67e32baef6efc9d9980e62923e26fa1c0006 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 10:04:36 +0200 Subject: [PATCH 13/19] Update precision.yml --- .github/workflows/precision.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/precision.yml b/.github/workflows/precision.yml index 4a073a8fc..48e3f340f 100644 --- a/.github/workflows/precision.yml +++ b/.github/workflows/precision.yml @@ -4,7 +4,13 @@ name: DALES precision CI on: # Triggers the workflow on push or pull request events but only for the feature/single_precision branch push: + branches: + - master + - OpenACC-Dev pull_request: + branches: + - master + - OpenACC-Dev # Allows you to run this workflow manually from the Actions tab workflow_dispatch: From d70ddc26c8dd0c254a96d5e33f3bd087c856b0ff Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 10:09:03 +0200 Subject: [PATCH 14/19] Offload loops --- src/modsubgrid.f90 | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 254388bc0..3aad7b82a 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -232,6 +232,7 @@ subroutine closure call timer_tic("Smagorinsky", 1) ! First level mlen = csz(1) * delta(1) + !$acc parallel loop collapse(2) private(strain2) async(1) do i = 2,i1 do j = 2,j1 strain2 = ( & @@ -266,8 +267,11 @@ subroutine closure end do ! Other levels + ! TODO: make this collapsable + !$acc loop private(mlen) do k = 2,kmax mlen = csz(k) * delta(k) + !$acc parallel loop collapse(2) private(strain2) async(1) do i = 2,i1 do j = 2,j1 strain2 = ( & @@ -318,6 +322,7 @@ subroutine closure else ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment if (ldelta .and. .not. lmason) then + !$acc parallel loop collapse(3) default(present) do k=1,kmax do j=2,j1 do i=2,i1 @@ -332,6 +337,7 @@ subroutine closure end do end do else if (ldelta .and. lmason) then ! delta scheme with Mason length scale correction + !$acc parallel loop collapse(3) default(present) do k=1,kmax do j=2,j1 do i=2,i1 @@ -347,6 +353,7 @@ subroutine closure end do end do else if (lanisotrop) then ! Anisotropic diffusion, https://doi.org/10.1029/2022MS003095 + !$acc parallel loop collapse(3) default(present) do k=1,kmax do j=2,j1 do i=2,i1 @@ -361,6 +368,7 @@ subroutine closure end do end do else ! Deardorff lengthscale correction + !$acc parallel loop collapse(3) default(present) do k=1,kmax do j=2,j1 do i=2,i1 @@ -398,6 +406,7 @@ subroutine closure call timer_toc("Boundary conditions") call timer_tic("Write buffer", 3) + !$acc parallel loop collapse(2) default(present) do j=1,j2 do i=1,i2 ekm(i,j,k1) = ekm(i,j,kmax) @@ -442,6 +451,7 @@ subroutine sources real tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv integer i,j,k + !$acc parallel loop collapse(3) default(present) private(tdef2) do k=2,kmax do j=2,j1 do i=2,i1 @@ -493,6 +503,7 @@ subroutine sources ! -------------------------------------------- if (sgs_surface_fix) then + !$acc parallel loop collapse(2) default(present) private(tdef2,horv,uwflux,vwflux,local_dudz,local_dvdz,local_dthvdz) do j=2,j1 do i=2,i1 tdef2 = 2. * ( & @@ -542,6 +553,7 @@ subroutine sources end do end do else + !$acc parallel loop collapse(2) default(present) private(tdef2) do j=2,j1 do i=2,i1 tdef2 = 2. * ( & @@ -634,9 +646,11 @@ subroutine sources ! sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) ! end do ! end do - + + !$acc kernels default(present) e12p(2:i1,2:j1,1) = e12p(2:i1,2:j1,1) + & sbshr(2:i1,2:j1,1)+sbbuo(2:i1,2:j1,1)+sbdiss(2:i1,2:j1,1) + !$acc end kernels return end subroutine sources @@ -653,6 +667,7 @@ subroutine diffc (a_in,a_out,flux) integer i,j,k + !$acc parallel loop collapse(3) default(present) do k=2,kmax do j=2,j1 do i=2,i1 @@ -674,7 +689,8 @@ subroutine diffc (a_in,a_out,flux) end do end do end do - + + !$acc parallel loop collapse(2) default(present) do j=2,j1 do i=2,i1 @@ -706,6 +722,8 @@ subroutine diffe(a_out) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) integer :: i,j,k + + !$acc parallel loop collapse(3) default(present) do k=2,kmax do j=2,j1 do i=2,i1 @@ -732,6 +750,7 @@ subroutine diffe(a_out) ! special treatment for lowest full level: k=1 ! -------------------------------------------- + !$acc parallel loop collapse(2) do j=2,j1 do i=2,i1 @@ -764,6 +783,7 @@ subroutine diffu (a_out) real :: ucu, upcu integer :: i,j,k + !$acc parallel loop collapse(3) default(present) private(emom, emop, empo, emmo) do k=2,kmax do j=2,j1 do i=2,i1 @@ -806,6 +826,7 @@ subroutine diffu (a_out) ! special treatment for lowest full level: k=1 ! -------------------------------------------- + !$acc parallel loop collapse(2) default(present) private(empo, emmo, emop, ucu, upcu, fu) do j=2,j1 do i=2,i1 @@ -869,6 +890,7 @@ subroutine diffv (a_out) real :: fv, vcv,vpcv integer :: i,j,k + !$acc parallel loop collapse(3) default(present) private(eomm, eomp, emmo, epmo) do k=2,kmax do j=2,j1 do i=2,i1 @@ -911,6 +933,7 @@ subroutine diffv (a_out) ! special treatment for lowest full level: k=1 ! -------------------------------------------- + !$acc parallel loop collapse(2) default(present) private(emmo, epmo, eomp, vcv, vpcv, fv) do j=2,j1 do i=2,i1 @@ -971,6 +994,8 @@ subroutine diffw(a_out) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real :: emom, eomm, eopm, epom integer :: i,j,k + + !$acc parallel loop collapse(3) default(present) private(emom, eomm, eopm, epom) do k=2,kmax do j=2,j1 do i=2,i1 From 4f68a356d28f0d03e16efb4c278912f176b03bed Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 14:04:59 +0200 Subject: [PATCH 15/19] Fix anis_fac index --- src/modsubgrid.f90 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 3aad7b82a..23d130671 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -697,10 +697,10 @@ subroutine diffc (a_in,a_out,flux) a_out(i,j,1) = a_out(i,j,1) & + 0.5 * ( & ( (ekh(i+1,j,1)+ekh(i,j,1))*(a_in(i+1,j,1)-a_in(i,j,1)) & - -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(k) & + -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(1) & + & ( (ekh(i,j+1,1)+ekh(i,j,1))*(a_in(i,j+1,1)-a_in(i,j,1)) & - -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(k) & + -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * (dzf(2)*ekh(i,j,1) + dzf(1)*ekh(i,j,2)) & * (a_in(i,j,2)-a_in(i,j,1)) / dzh(2)**2 & @@ -756,10 +756,10 @@ subroutine diffe(a_out) a_out(i,j,1) = a_out(i,j,1) + & ( (ekm(i+1,j,1)+ekm(i,j,1))*(e120(i+1,j,1)-e120(i,j,1)) & - -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(k) & + -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(1) & + & ( (ekm(i,j+1,1)+ekm(i,j,1))*(e120(i,j+1,1)-e120(i,j,1)) & - -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(k) & + -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * (dzf(2)*ekm(i,j,1) + dzf(1)*ekm(i,j,2)) & * (e120(i,j,2)-e120(i,j,1)) / dzh(2)**2 )/dzf(1) @@ -860,12 +860,12 @@ subroutine diffu (a_out) a_out(i,j,1) = a_out(i,j,1) & + & ( ekm(i,j,1) * (u0(i+1,j,1)-u0(i,j,1)) & - -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) & + -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(1) & + & ( empo * ( (u0(i,j+1,1)-u0(i,j,1)) *dyi & +(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi) & -emmo * ( (u0(i,j,1)-u0(i,j-1,1)) *dyi & - +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy * anis_fac(k) & + +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i-1,j,2)) *dxi) & @@ -967,10 +967,10 @@ subroutine diffv (a_out) ( epmo * ( (v0(i+1,j,1)-v0(i,j,1)) *dxi & +(u0(i+1,j,1)-u0(i+1,j-1,1))*dyi) & -emmo * ( (v0(i,j,1)-v0(i-1,j,1)) *dxi & - +(u0(i,j,1)-u0(i,j-1,1)) *dyi) ) / dx * anis_fac(k) & + +(u0(i,j,1)-u0(i,j-1,1)) *dyi) ) / dx * anis_fac(1) & + & ( ekm(i,j,1) * (v0(i,j+1,1)-v0(i,j,1)) & - -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1)) ) * 2. * dy2i * anis_fac(k) & + -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1)) ) * 2. * dy2i * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i,j-1,2)) *dyi) & From be059b5f506a302150acbd4c58363af67d9b05d7 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 14:50:18 +0200 Subject: [PATCH 16/19] Add/update data clauses --- src/modadvection.f90 | 3 +-- src/modsubgrid.f90 | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/modadvection.f90 b/src/modadvection.f90 index 573247b7b..058f44c95 100644 --- a/src/modadvection.f90 +++ b/src/modadvection.f90 @@ -47,7 +47,7 @@ subroutine advection ! leq = .false. ! for testing that the non-uniform advection routines agree with the uniform ones ! when the grid is uniform - !$acc data copyin(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh) copy(up, vp, wp, e12p, thlp, qtp) + !$acc enter data copyin(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh, up, vp, wp, e12p, thlp, qtp) select case(iadv_mom) case(iadv_cd2) call advecu_2nd(u0,up) @@ -212,6 +212,5 @@ subroutine advection end select end do !$acc wait - !$acc end data end subroutine advection end module modadvection diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index 23d130671..acc5b2fac 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -114,7 +114,8 @@ subroutine initsubgrid write (6,*) 'cs = ',cs write (6,*) 'Rigc = ',Rigc endif - + + !$acc enter data create(anis_fac) end subroutine initsubgrid subroutine subgridnamelist @@ -158,15 +159,24 @@ subroutine subgrid ! Diffusion subroutines ! Thijs Heus, Chiel van Heerwaarden, 15 June 2007 - use modglobal, only : nsv, lmoist - use modfields, only : up,vp,wp,e12p,thl0,thlp,qt0,qtp,sv0,svp - use modsurfdata,only : thlflux,qtflux,svflux + use modglobal, only : nsv, lmoist, deltai, delta, dzf, dzh + use modfields, only : up,vp,wp,e12p,thl0,thlp,qt0,qtp,sv0,svp,u0,v0,w0,rhobh,rhobf,e120,dthvdz, thvf + use modsurfdata,only : thlflux,qtflux,svflux,ustar,dudz,dvdz + implicit none integer n + + ! Already on GPU: u0, v0, w0, 120, thl0, qt0, dzf, dzh, rhobf, rhobh, + ! up, vp, wp, e12p, thlp, qtp + !$acc enter data copyin(thvf, thlflux, dthvdz, qtflux, delta, deltai, ustar) + !$acc enter data copyin(dudz, dvdz) + !$acc enter data copyin(ekm, ekh, zlt, sbdiss, sbbuo, sbshr, csz) + call timer_tic("closure", 0) call closure call timer_toc("closure") + !$acc wait call diffu(up) call diffv(vp) call diffw(wp) @@ -177,11 +187,17 @@ subroutine subgrid call diffc(sv0(:,:,:,n),svp(:,:,:,n),svflux(:,:,n)) end do if (.not. lsmagorinsky) call sources + + !$acc exit data copyout(up, vp, wp) + !$acc exit data copyout(e12p, thlp, qtp) + !$acc exit data copyout(ekm, ekh, zlt) + !$acc exit data delete(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh) + !$acc exit data delete(csz) end subroutine subroutine exitsubgrid implicit none - deallocate(ekm,ekh,zlt,sbdiss,sbbuo,sbshr,csz) + deallocate(ekm,ekh,zlt,sbdiss,sbbuo,sbshr,csz,anis_fac) end subroutine exitsubgrid subroutine closure From c270f403f7b3001f7300cccf6f5b5f7e91664b79 Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 15:26:03 +0200 Subject: [PATCH 17/19] Update exchange kernels --- src/modmpi.f90 | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/modmpi.f90 b/src/modmpi.f90 index fe6a1bc09..986cdf0a6 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -335,6 +335,9 @@ subroutine exitmpi end subroutine exitmpi subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) + #if defined(_OPENACC) + use openacc + #endif implicit none integer sx, ex, sy, ey, sz, ez, ih, jh real(real32) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez) @@ -354,7 +357,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) #endif ! Calulate buffer lengths - !$acc kernels if(is_present) async + !$acc kernels default(present) if(is_present) xl = size(a,1) yl = size(a,2) zl = size(a,3) @@ -392,7 +395,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic - !$acc kernels default(present) + !$acc kernels default(present) if(is_present) a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:) a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:) !$acc end kernels @@ -426,7 +429,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic - !$acc kernels default(present) + !$acc kernels default(present) if(is_present) a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:) a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:) !$acc end kernels @@ -457,6 +460,9 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) end subroutine excjs_real32 subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) + #if defined(_OPENACC) + use openacc + #endif implicit none integer sx, ex, sy, ey, sz, ez, ih, jh real(real64) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez) @@ -476,7 +482,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) #endif ! Calulate buffer lengths - !$acc kernels default(present) if(is_present) async(1) + !$acc kernels default(present) if(is_present) xl = size(a,1) yl = size(a,2) zl = size(a,3) @@ -515,7 +521,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic - !$acc kernels default(present) if(is_present) async(1) + !$acc kernels default(present) if(is_present) a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:) a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:) !$acc end kernels @@ -549,7 +555,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic - !$acc kernels default(present) if(is_present) async(1) + !$acc kernels default(present) if(is_present) a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:) a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:) !$acc end kernels From a0652712566e48e3b351881ced46c4db78c684dc Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 15:30:59 +0200 Subject: [PATCH 18/19] Fix preprocessor stuff --- src/modmpi.f90 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/modmpi.f90 b/src/modmpi.f90 index 986cdf0a6..74193329f 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -335,9 +335,9 @@ subroutine exitmpi end subroutine exitmpi subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) - #if defined(_OPENACC) +#if defined(_OPENACC) use openacc - #endif +#endif implicit none integer sx, ex, sy, ey, sz, ez, ih, jh real(real32) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez) @@ -351,10 +351,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) , sende,recve & , sendw,recvw ! Check if the data is on the gpu - #if defined(_OPENACC) +#if defined(_OPENACC) integer :: is_present is_present = acc_is_present(a) - #endif +#endif ! Calulate buffer lengths !$acc kernels default(present) if(is_present) @@ -460,9 +460,9 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) end subroutine excjs_real32 subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) - #if defined(_OPENACC) +#if defined(_OPENACC) use openacc - #endif +#endif implicit none integer sx, ex, sy, ey, sz, ez, ih, jh real(real64) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez) @@ -476,10 +476,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) , sende,recve & , sendw,recvw ! Check if the data is on the gpu - #if defined(_OPENACC) +#if defined(_OPENACC) integer :: is_present is_present = acc_is_present(a) - #endif +#endif ! Calulate buffer lengths !$acc kernels default(present) if(is_present) From d41510a1eb4d6d0eebd86bac4c6608ea6c07a6de Mon Sep 17 00:00:00 2001 From: Caspar Jungbacker <caspar.jungbacker@outlook.com> Date: Tue, 13 Jun 2023 15:36:47 +0200 Subject: [PATCH 19/19] Change data type --- src/modmpi.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modmpi.f90 b/src/modmpi.f90 index 74193329f..c4347a180 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -352,7 +352,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) , sendw,recvw ! Check if the data is on the gpu #if defined(_OPENACC) - integer :: is_present + logical :: is_present is_present = acc_is_present(a) #endif @@ -477,7 +477,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) , sendw,recvw ! Check if the data is on the gpu #if defined(_OPENACC) - integer :: is_present + logical :: is_present is_present = acc_is_present(a) #endif