diff --git a/.github/workflows/precision.yml b/.github/workflows/precision.yml index 4a073a8f..48e3f340 100644 --- a/.github/workflows/precision.yml +++ b/.github/workflows/precision.yml @@ -4,7 +4,13 @@ name: DALES precision CI on: # Triggers the workflow on push or pull request events but only for the feature/single_precision branch push: + branches: + - master + - OpenACC-Dev pull_request: + branches: + - master + - OpenACC-Dev # Allows you to run this workflow manually from the Actions tab workflow_dispatch: diff --git a/src/modadvection.f90 b/src/modadvection.f90 index 573247b7..058f44c9 100644 --- a/src/modadvection.f90 +++ b/src/modadvection.f90 @@ -47,7 +47,7 @@ subroutine advection ! leq = .false. ! for testing that the non-uniform advection routines agree with the uniform ones ! when the grid is uniform - !$acc data copyin(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh) copy(up, vp, wp, e12p, thlp, qtp) + !$acc enter data copyin(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh, up, vp, wp, e12p, thlp, qtp) select case(iadv_mom) case(iadv_cd2) call advecu_2nd(u0,up) @@ -212,6 +212,5 @@ subroutine advection end select end do !$acc wait - !$acc end data end subroutine advection end module modadvection diff --git a/src/modbudget.f90 b/src/modbudget.f90 index b815ea24..1a0fa194 100644 --- a/src/modbudget.f90 +++ b/src/modbudget.f90 @@ -213,7 +213,7 @@ subroutine do_genbudget ijtot,cu,cv,grav, & dxi,dyi,dx2i,dy2i use modsurfdata,only : ustar - use modsubgriddata, only : ekm + use modsubgriddata, only : ekm,anis_fac use modpois, only : p use modfields, only : u0,v0,w0,thv0h,u0av,v0av,rhobf,rhobh,thvh !cstep use modtilt, only : adjustbudget,ltilted @@ -407,12 +407,12 @@ subroutine do_genbudget tau1j(i,j,k) = & ( ekm(i,j,k) * (u0(i+1,j,k)-u0(i,j,k)) & - -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i & + -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) & + & ( empo * ( (u0(i,jp,k)-u0(i,j,k)) *dyi & +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) & -emmo * ( (u0(i,j,k)-u0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) * dyi & + +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) * dyi * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k)) /dzh(kp) & +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) & @@ -455,10 +455,10 @@ subroutine do_genbudget ( epmo * ( (v0(i+1,j,k)-v0(i,j,k)) *dxi & +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) & -emmo * ( (v0(i,j,k)-v0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) * dxi & + +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) * dxi * anis_fac(k) & + & ( ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) & - -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i & + -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k)) /dzh(kp) & +(w0(i,j,kp)-w0(i,jm,kp)) *dyi) & @@ -501,12 +501,12 @@ subroutine do_genbudget ( epom * ( (w0(i+1,j,k)-w0(i,j,k)) *dxi & +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) & -emom * ( (w0(i,j,k)-w0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))*dxi & + +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))*dxi * anis_fac(k) & + & ( eopm * ( (w0(i,jp,k)-w0(i,j,k)) *dyi & +(v0(i,jp,k)-v0(i,jp,km)) /dzh(k) ) & -eomm * ( (w0(i,j,k)-w0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))*dyi & + +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))*dyi * anis_fac(k) & + & ( rhobf(k)/rhobh(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) & - rhobf(km)/rhobh(k) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. & @@ -553,12 +553,12 @@ subroutine do_genbudget tau1j(i,j,1) = & ( ekm(i,j,1) * (u0(i+1,j,1)-u0(i,j,1)) & - -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i & + -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) & + & ( empo * ( (u0(i,jp,1)-u0(i,j,1)) *dyi & +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) & - emmo * ( (u0(i,j,1)-u0(i,jm,1)) *dyi & - +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) * dyi & + +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) * dyi * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i-1,j,2)) *dxi) & @@ -584,10 +584,10 @@ subroutine do_genbudget ( epmo * ( (v0(i+1,j,1)-v0(i,j,1)) *dxi & +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) & -emmo * ( (v0(i,j,1)-v0(i-1,j,1)) *dxi & - +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) * dxi & + +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) * dxi * anis_fac(k) & + & ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) & - -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i & + -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i * anis_fac(k) & + & ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i,jm,2)) *dyi) & diff --git a/src/modforces.f90 b/src/modforces.f90 index 93d33f4e..09bbe322 100644 --- a/src/modforces.f90 +++ b/src/modforces.f90 @@ -63,69 +63,36 @@ subroutine forces use modglobal, only : i1,j1,kmax,dzh,dzf,grav, lpressgrad use modfields, only : sv0,up,vp,wp,thv0h,dpdxl,dpdyl,thvh use moduser, only : force_user - use modmicrodata, only : imicro, imicro_bulk, imicro_bin, imicro_sice,iqr + use modmicrodata, only : imicro, imicro_bulk, imicro_bin, imicro_sice, imicro_sice2, iqr implicit none - integer i, j, k, jm, jp, km, kp + integer i, j, k if (lforce_user) call force_user - if((imicro==imicro_sice).or.(imicro==imicro_bulk).or.(imicro==imicro_bin)) then + if (lpressgrad) then + do k=1,kmax + up(:,:,k) = up(:,:,k) - dpdxl(k) !RN LS pressure gradient force in x,y directions; + vp(:,:,k) = vp(:,:,k) - dpdyl(k) + end do + end if + + if((imicro==imicro_sice).or.(imicro==imicro_sice2).or.(imicro==imicro_bulk).or.(imicro==imicro_bin)) then do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - - if (lpressgrad) then - up(i,j,k) = up(i,j,k) - dpdxl(k) !RN LS pressure gradient force in x,y directions; - vp(i,j,k) = vp(i,j,k) - dpdyl(k) - end if - wp(i,j,k) = wp(i,j,k) + grav*(thv0h(i,j,k)-thvh(k))/thvh(k) - & - grav*(sv0(i,j,k,iqr)*dzf(k-1)+sv0(i,j,k-1,iqr)*dzf(k))/(2.0*dzh(k)) - end do - end do + wp(:,:,k) = wp(:,:,k) + grav*(thv0h(:,:,k)-thvh(k))/thvh(k) - & + grav*(sv0(:,:,k,iqr)*dzf(k-1)+sv0(:,:,k-1,iqr)*dzf(k))/(2.0*dzh(k)) end do else do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - up(i,j,k) = up(i,j,k) - dpdxl(k) - vp(i,j,k) = vp(i,j,k) - dpdyl(k) - wp(i,j,k) = wp(i,j,k) + grav*(thv0h(i,j,k)-thvh(k))/thvh(k) - end do - end do + wp(:,:,k) = wp(:,:,k) + grav*(thv0h(:,:,k)-thvh(k))/thvh(k) end do end if ! -------------------------------------------- ! special treatment for lowest full level: k=1 ! -------------------------------------------- + wp(:,:,1) = 0.0 - do j=2,j1 - jp = j+1 - jm = j-1 - do i=2,i1 - - if (lpressgrad) then - up(i,j,1) = up(i,j,1) - dpdxl(1) - vp(i,j,1) = vp(i,j,1) - dpdyl(1) - end if - - wp(i,j,1) = 0.0 - - end do - end do -! ----------------------------------------------end i,j-loop - - - return end subroutine forces subroutine coriolis @@ -152,7 +119,7 @@ subroutine coriolis integer i, j, k, jm, jp, km, kp if (lcoriol .eqv. .false.) return - + do k=2,kmax kp=k+1 km=k-1 @@ -232,80 +199,45 @@ subroutine lstend dqtdtls, dthldtls, dudtls, dvdtls implicit none - integer i,j,k,n,kp,km - real subs_thl,subs_qt,subs_sv,subs_u,subs_v + integer k,kp,km ! 1. DETERMINE LARGE SCALE TENDENCIES ! -------------------------------- ! 1.1 lowest model level above surface : only downward component - - subs_thl = 0. - subs_qt = 0. - subs_sv = 0. - subs_u = 0. - subs_v = 0. - - do j=2,j1 - do i=2,i1 - k = 1 - if (whls(2).lt.0) then !neglect effect of mean ascending on tendencies at the lowest full level - subs_thl = 0.5*whls(2) *(thl0(i,j,2)-thl0(i,j,1))/dzh(2) - subs_qt = 0.5*whls(2) *(qt0(i,j,2)-qt0(i,j,1) )/dzh(2) - if (lmomsubs) then - subs_u = 0.5*whls(2) *(u0(i,j,2)-u0(i,j,1))/dzh(2) - subs_v = 0.5*whls(2) *(v0(i,j,2)-v0(i,j,1))/dzh(2) - endif - do n=1,nsv - subs_sv = 0.5*whls(2) *(sv0(i,j,2,n)-sv0(i,j,1,n) )/dzh(2) - svp(i,j,1,n) = svp(i,j,1,n)-subs_sv - enddo - endif - thlp(i,j,1) = thlp(i,j,1) -u0av(1)*dthldxls(1)-v0av(1)*dthldyls(1)-subs_thl + dthldtls(1) - qtp(i,j,1) = qtp (i,j,1) -u0av(1)*dqtdxls (1)-v0av(1)*dqtdyls (1)-subs_qt + dqtdtls(1) - up (i,j,1) = up (i,j,1) -u0av(1)*dudxls (1)-v0av(1)*dudyls (1)-subs_u + dudtls(1) - vp (i,j,1) = vp (i,j,1) -u0av(1)*dvdxls (1)-v0av(1)*dvdyls (1)-subs_v + dvdtls(1) - end do - end do - ! 1.2 other model levels twostream - do k=2,kmax + do k=1,kmax kp=k+1 km=k-1 - do j=2,j1 - do i=2,i1 - if (whls(kp).lt.0) then !downwind scheme for subsidence - subs_thl = whls(kp) * (thl0(i,j,kp) - thl0(i,j,k))/dzh(kp) - subs_qt = whls(kp) * (qt0 (i,j,kp) - qt0 (i,j,k))/dzh(kp) - if (lmomsubs) then - subs_u = whls(kp) * (u0(i,j,kp) - u0(i,j,k))/dzh(kp) - subs_v = whls(kp) * (v0(i,j,kp) - v0(i,j,k))/dzh(kp) - endif - do n=1,nsv - subs_sv = whls(kp) *(sv0(i,j,kp,n) - sv0(i,j,k,n))/dzh(kp) - svp(i,j,k,n) = svp(i,j,k,n)-subs_sv - enddo - else !downwind scheme for mean upward motions - subs_thl = whls(k) * (thl0(i,j,k) - thl0(i,j,km))/dzh(k) - subs_qt = whls(k) * (qt0 (i,j,k) - qt0 (i,j,km))/dzh(k) + + if (whls(kp).lt.0) then !downwind scheme for subsidence + thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k) - whls(kp) * (thl0(2:i1,2:j1,kp) - thl0(2:i1,2:j1,k))/dzh(kp) + qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k) - whls(kp) * (qt0 (2:i1,2:j1,kp) - qt0 (2:i1,2:j1,k))/dzh(kp) + if (lmomsubs) then + up(2:i1,2:j1,k) = up(2:i1,2:j1,k) - whls(kp) * (u0(2:i1,2:j1,kp) - u0(2:i1,2:j1,k))/dzh(kp) + vp(2:i1,2:j1,k) = vp(2:i1,2:j1,k) - whls(kp) * (v0(2:i1,2:j1,kp) - v0(2:i1,2:j1,k))/dzh(kp) + endif + + svp(2:i1,2:j1,k,:) = svp(2:i1,2:j1,k,:) - whls(kp) * (sv0(2:i1,2:j1,kp,:) - sv0(2:i1,2:j1,k,:))/dzh(kp) + + else !downwind scheme for mean upward motions + if (k > 1) then !neglect effect of mean ascending on tendencies at the lowest full level + thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k) - whls(k) * (thl0(2:i1,2:j1,k) - thl0(2:i1,2:j1,km))/dzh(k) + qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k) - whls(k) * (qt0 (2:i1,2:j1,k) - qt0 (2:i1,2:j1,km))/dzh(k) if (lmomsubs) then - subs_u = whls(k) * (u0(i,j,k) - u0(i,j,km))/dzh(k) - subs_v = whls(k) * (v0(i,j,k) - v0(i,j,km))/dzh(k) + up(2:i1,2:j1,k) = up(2:i1,2:j1,k) - whls(k) * (u0(2:i1,2:j1,k) - u0(2:i1,2:j1,km))/dzh(k) + vp(2:i1,2:j1,k) = vp(2:i1,2:j1,k) - whls(k) * (v0(2:i1,2:j1,k) - v0(2:i1,2:j1,km))/dzh(k) endif - do n=1,nsv - subs_sv = whls(k) * (sv0(i,j,k,n) - sv0(i,j,km,n))/dzh(k) - svp(i,j,k,n) = svp(i,j,k,n)-subs_sv - enddo - endif - - thlp(i,j,k) = thlp(i,j,k)-u0av(k)*dthldxls(k)-v0av(k)*dthldyls(k)-subs_thl + dthldtls(k) - qtp (i,j,k) = qtp (i,j,k)-u0av(k)*dqtdxls (k)-v0av(k)*dqtdyls (k)-subs_qt + dqtdtls(k) - up (i,j,k) = up (i,j,k)-u0av(k)*dudxls (k)-v0av(k)*dudyls (k)-subs_u + dudtls(k) - vp (i,j,k) = vp (i,j,k)-u0av(k)*dvdxls (k)-v0av(k)*dvdyls (k)-subs_v + dvdtls(k) - - enddo - enddo + svp(2:i1,2:j1,k,:) = svp(2:i1,2:j1,k,:)-whls(k) * (sv0(2:i1,2:j1,k,:) - sv0(2:i1,2:j1,km,:))/dzh(k) + endif + endif + + thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k)-u0av(k)*dthldxls(k)-v0av(k)*dthldyls(k) + dthldtls(k) + qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k)-u0av(k)*dqtdxls (k)-v0av(k)*dqtdyls (k) + dqtdtls(k) + up (2:i1,2:j1,k) = up (2:i1,2:j1,k)-u0av(k)*dudxls (k)-v0av(k)*dudyls (k) + dudtls(k) + vp (2:i1,2:j1,k) = vp (2:i1,2:j1,k)-u0av(k)*dvdxls (k)-v0av(k)*dvdyls (k) + dvdtls(k) + enddo return diff --git a/src/modglobal.f90 b/src/modglobal.f90 index e550f64c..8c594f11 100644 --- a/src/modglobal.f90 +++ b/src/modglobal.f90 @@ -216,7 +216,7 @@ module modglobal real :: xsize = -1 !< domain size in x-direction real :: ysize = -1 !< domain size in y-direction real, allocatable :: delta(:) !< (dx*dy*dz)**(1/3) - real, allocatable :: deltai(:) !< (dx*dy*dz)**(-1/3) + real, allocatable :: deltai(:) !< (dx*dy*dz)**(-1/3) or dzf**-1 for anisotropic diffusion logical :: leq = .true. !< switch for (non)-equidistant mode. logical :: lmomsubs = .false. !< switch to apply subsidence on the momentum or not @@ -433,7 +433,7 @@ subroutine initglobal do k=1,k1 delta(k) = (dx*dy*dzf(k))**(1./3.) - deltai(k) = 1./delta(k) + deltai(k) = 1./delta(k) !can be overruled in modsubgrid in case anisotropic diffusion is applied end do !-------------------------------------------------- diff --git a/src/modmpi.f90 b/src/modmpi.f90 index 5564689d..c4347a18 100644 --- a/src/modmpi.f90 +++ b/src/modmpi.f90 @@ -35,6 +35,9 @@ module modmpi use modmpiinterface +#if defined(_OPENACC) +use openacc +#endif implicit none save type(MPI_COMM) :: commwrld, comm3d, commrow, commcol @@ -332,6 +335,9 @@ subroutine exitmpi end subroutine exitmpi subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) +#if defined(_OPENACC) + use openacc +#endif implicit none integer sx, ex, sy, ey, sz, ez, ih, jh real(real32) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez) @@ -344,11 +350,18 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) , sends,recvs & , sende,recve & , sendw,recvw + ! Check if the data is on the gpu +#if defined(_OPENACC) + logical :: is_present + is_present = acc_is_present(a) +#endif ! Calulate buffer lengths + !$acc kernels default(present) if(is_present) xl = size(a,1) yl = size(a,2) zl = size(a,3) + !$acc end kernels ! Calculate buffer size nssize = xl*jh*zl @@ -382,8 +395,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) if(is_present) a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:) a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:) + !$acc end kernels endif @@ -414,9 +429,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) if(is_present) a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:) a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:) - + !$acc end kernels endif if(nprocy.gt.1)then @@ -444,6 +460,9 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh) end subroutine excjs_real32 subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) +#if defined(_OPENACC) + use openacc +#endif implicit none integer sx, ex, sy, ey, sz, ez, ih, jh real(real64) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez) @@ -456,11 +475,18 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) , sends,recvs & , sende,recve & , sendw,recvw + ! Check if the data is on the gpu +#if defined(_OPENACC) + logical :: is_present + is_present = acc_is_present(a) +#endif ! Calulate buffer lengths + !$acc kernels default(present) if(is_present) xl = size(a,1) yl = size(a,2) zl = size(a,3) + !$acc end kernels ! Calculate buffer size nssize = xl*jh*zl @@ -495,8 +521,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) if(is_present) a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:) a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:) + !$acc end kernels endif @@ -527,8 +555,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh) else ! Single processor, make sure the field is periodic + !$acc kernels default(present) if(is_present) a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:) a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:) + !$acc end kernels endif diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90 index ab559806..acc5b2fa 100644 --- a/src/modsubgrid.f90 +++ b/src/modsubgrid.f90 @@ -31,20 +31,21 @@ module modsubgrid use modsubgriddata use modprecision, only: field_r +use modtimer, only: timer_tic, timer_toc implicit none save public :: subgrid, initsubgrid, exitsubgrid, subgridnamelist contains subroutine initsubgrid - use modglobal, only : ih,i1,jh,j1,k1,delta,zf,fkar,pi + use modglobal, only : ih,i1,jh,j1,k1,delta,deltai,dx,dy,zf,dzf,fkar,pi use modmpi, only : myid implicit none integer :: k - real :: ceps, ch + real :: ceps real :: mlen call subgridnamelist @@ -56,9 +57,12 @@ subroutine initsubgrid allocate(sbshr(2-ih:i1+ih,2-jh:j1+jh,1)) allocate(sbbuo(2-ih:i1+ih,2-jh:j1+jh,1)) allocate(csz(k1)) + allocate(anis_fac(k1)) ! Initialize variables to avoid problems when not using subgrid scheme JvdD + ! Determination of subgrid constants is explained in De Roode et al. 2017 ekm=0.; ekh=0.; zlt=0.; sbdiss=0.; sbshr=0.; sbbuo=0.; csz=0. + anis_fac = 0. cm = cf / (2. * pi) * (1.5*alpha_kolm)**(-1.5) @@ -83,6 +87,21 @@ subroutine initsubgrid end do end if + if(lanisotrop) then + ! Anisotropic diffusion scheme https://doi.org/10.1029/2022MS003095 + ! length scale in TKE equation is delta z (private communication with Marat) + if ((dx.ne.dy) .and. myid == 0) stop "The anisotropic diffusion assumes dx=dy." + do k = 1,k1 + deltai (k) = 1./dzf(k) !overrules deltai (k) = 1/delta(k) as defined in initglobal + anis_fac (k) = (dx/dzf(k))**2 !assumes dx=dy. is used to enhance horizontal diffusion + end do + else + do k = 1,k1 + anis_fac (k) = 1. !horizontal = vertical diffusion + end do + endif + + if (myid==0) then write (6,*) 'cf = ',cf write (6,*) 'cm = ',cm @@ -95,7 +114,8 @@ subroutine initsubgrid write (6,*) 'cs = ',cs write (6,*) 'Rigc = ',Rigc endif - + + !$acc enter data create(anis_fac) end subroutine initsubgrid subroutine subgridnamelist @@ -107,7 +127,7 @@ subroutine subgridnamelist integer :: ierr namelist/NAMSUBGRID/ & - ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,ch1 + ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1,lanisotrop if(myid==0)then open(ifnamopt,file=fname_options,status='old',iostat=ierr) @@ -115,12 +135,16 @@ subroutine subgridnamelist call checknamelisterror(ierr, ifnamopt, 'NAMSUBGRID') write(6 ,NAMSUBGRID) close(ifnamopt) + + if (lmason .and. .not. ldelta) stop "lmason = .true. requires ldelta = .true." + if (lmason .and. lanisotrop) stop "lmason = .true. is not compatible with lanisotropic = .true." end if call D_MPI_BCAST(ldelta ,1, 0,comm3d,mpierr) call D_MPI_BCAST(lmason ,1, 0,comm3d,mpierr) call D_MPI_BCAST(nmason ,1, 0,comm3d,mpierr) call D_MPI_BCAST(lsmagorinsky ,1, 0,comm3d,mpierr) + call D_MPI_BCAST(lanisotrop ,1, 0,comm3d,mpierr) call D_MPI_BCAST(cs ,1, 0,comm3d,mpierr) call D_MPI_BCAST(cf ,1, 0,comm3d,mpierr) call D_MPI_BCAST(cn ,1, 0,comm3d,mpierr) @@ -128,7 +152,6 @@ subroutine subgridnamelist call D_MPI_BCAST(Prandtl ,1, 0,comm3d,mpierr) call D_MPI_BCAST(sgs_surface_fix ,1, 0,comm3d,mpierr) call D_MPI_BCAST(ch1 ,1, 0,comm3d,mpierr) - end subroutine subgridnamelist subroutine subgrid @@ -136,13 +159,24 @@ subroutine subgrid ! Diffusion subroutines ! Thijs Heus, Chiel van Heerwaarden, 15 June 2007 - use modglobal, only : nsv, lmoist - use modfields, only : up,vp,wp,e12p,thl0,thlp,qt0,qtp,sv0,svp - use modsurfdata,only : thlflux,qtflux,svflux + use modglobal, only : nsv, lmoist, deltai, delta, dzf, dzh + use modfields, only : up,vp,wp,e12p,thl0,thlp,qt0,qtp,sv0,svp,u0,v0,w0,rhobh,rhobf,e120,dthvdz, thvf + use modsurfdata,only : thlflux,qtflux,svflux,ustar,dudz,dvdz + implicit none integer n + ! Already on GPU: u0, v0, w0, 120, thl0, qt0, dzf, dzh, rhobf, rhobh, + ! up, vp, wp, e12p, thlp, qtp + + !$acc enter data copyin(thvf, thlflux, dthvdz, qtflux, delta, deltai, ustar) + !$acc enter data copyin(dudz, dvdz) + !$acc enter data copyin(ekm, ekh, zlt, sbdiss, sbbuo, sbshr, csz) + + call timer_tic("closure", 0) call closure + call timer_toc("closure") + !$acc wait call diffu(up) call diffv(vp) call diffw(wp) @@ -153,11 +187,17 @@ subroutine subgrid call diffc(sv0(:,:,:,n),svp(:,:,:,n),svflux(:,:,n)) end do if (.not. lsmagorinsky) call sources + + !$acc exit data copyout(up, vp, wp) + !$acc exit data copyout(e12p, thlp, qtp) + !$acc exit data copyout(ekm, ekh, zlt) + !$acc exit data delete(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh) + !$acc exit data delete(csz) end subroutine subroutine exitsubgrid implicit none - deallocate(ekm,ekh,zlt,sbdiss,sbbuo,sbshr,csz) + deallocate(ekm,ekh,zlt,sbdiss,sbbuo,sbshr,csz,anis_fac) end subroutine exitsubgrid subroutine closure @@ -202,81 +242,88 @@ subroutine closure implicit none real :: strain2,mlen - integer :: i,j,k,kp,km,jp,jm + integer :: i,j,k if(lsmagorinsky) then - do k = 1,kmax - mlen = csz(k) * delta(k) - + call timer_tic("Smagorinsky", 1) + ! First level + mlen = csz(1) * delta(1) + !$acc parallel loop collapse(2) private(strain2) async(1) + do i = 2,i1 + do j = 2,j1 + strain2 = ( & + ((u0(i+1,j,1)-u0(i,j,1)) *dxi )**2 + & + ((v0(i,j+1,1)-v0(i,j,1)) *dyi )**2 + & + ((w0(i,j,2)-w0(i,j,1)) /dzf(1) )**2 ) + + strain2 = strain2 + 0.5 * ( & + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & + dudz(i,j) )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((u0(i,j+1,1)-u0(i,j,1)) *dyi + & + (v0(i,j+1,1)-v0(i-1,j+1,1)) *dxi )**2 + & + ((u0(i,j,1)-u0(i,j-1,1)) *dyi + & + (v0(i,j,1)-v0(i-1,j,1)) *dxi )**2 + & + ((u0(i+1,j,1)-u0(i+1,j-1,1)) *dyi + & + (v0(i+1,j,1)-v0(i,j,1)) *dxi )**2 + & + ((u0(i+1,j+1,1)-u0(i+1,j,1)) *dyi + & + (v0(i+1,j+1,1)-v0(i,j+1,1)) *dxi )**2 ) + + strain2 = strain2 + 0.5 * ( & + (0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & + dvdz(i,j) )**2 ) + + ekm(i,j,1) = mlen ** 2. * sqrt(2. * strain2) + ekh(i,j,1) = ekm(i,j,1) / Prandtl + + ekm(i,j,1) = max(ekm(i,j,1),ekmin) + ekh(i,j,1) = max(ekh(i,j,1),ekmin) + end do + end do + + ! Other levels + ! TODO: make this collapsable + !$acc loop private(mlen) + do k = 2,kmax + mlen = csz(k) * delta(k) + !$acc parallel loop collapse(2) private(strain2) async(1) do i = 2,i1 do j = 2,j1 - - kp=k+1 - km=k-1 - jp=j+1 - jm=j-1 - - if(k == 1) then - strain2 = ( & - ((u0(i+1,j,k)-u0(i,j,k)) *dxi )**2 + & - ((v0(i,jp,k)-v0(i,j,k)) *dyi )**2 + & - ((w0(i,j,kp)-w0(i,j,k)) /dzf(k) )**2 ) - - strain2 = strain2 + 0.5 * ( & - ( 0.25*(w0(i+1,j,kp)-w0(i-1,j,kp))*dxi + & - dudz(i,j) )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((u0(i,jp,k)-u0(i,j,k)) *dyi + & - (v0(i,jp,k)-v0(i-1,jp,k)) *dxi )**2 + & - ((u0(i,j,k)-u0(i,jm,k)) *dyi + & - (v0(i,j,k)-v0(i-1,j,k)) *dxi )**2 + & - ((u0(i+1,j,k)-u0(i+1,jm,k)) *dyi + & - (v0(i+1,j,k)-v0(i,j,k)) *dxi )**2 + & - ((u0(i+1,jp,k)-u0(i+1,j,k)) *dyi + & - (v0(i+1,jp,k)-v0(i,jp,k)) *dxi )**2 ) - - strain2 = strain2 + 0.5 * ( & - ( 0.25*(w0(i,jp,kp)-w0(i,jm,kp))*dyi + & - dvdz(i,j) )**2 ) - - else - - strain2 = ( & - ((u0(i+1,j,k)-u0(i,j,k)) *dxi )**2 + & - ((v0(i,jp,k)-v0(i,j,k)) *dyi )**2 + & - ((w0(i,j,kp)-w0(i,j,k)) /dzf(k) )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((w0(i,j,kp)-w0(i-1,j,kp)) *dxi + & - (u0(i,j,kp)-u0(i,j,k)) / dzh(kp) )**2 + & - ((w0(i,j,k)-w0(i-1,j,k)) *dxi + & - (u0(i,j,k)-u0(i,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,k)-w0(i,j,k)) *dxi + & - (u0(i+1,j,k)-u0(i+1,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,kp)-w0(i,j,kp)) *dxi + & - (u0(i+1,j,kp)-u0(i+1,j,k)) / dzh(kp) )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((u0(i,jp,k)-u0(i,j,k)) *dyi + & - (v0(i,jp,k)-v0(i-1,jp,k)) *dxi )**2 + & - ((u0(i,j,k)-u0(i,jm,k)) *dyi + & - (v0(i,j,k)-v0(i-1,j,k)) *dxi )**2 + & - ((u0(i+1,j,k)-u0(i+1,jm,k)) *dyi + & - (v0(i+1,j,k)-v0(i,j,k)) *dxi )**2 + & - ((u0(i+1,jp,k)-u0(i+1,j,k)) *dyi + & - (v0(i+1,jp,k)-v0(i,jp,k)) *dxi )**2 ) - - strain2 = strain2 + 0.125 * ( & - ((v0(i,j,kp)-v0(i,j,k)) / dzh(kp) + & - (w0(i,j,kp)-w0(i,jm,kp)) *dyi )**2 + & - ((v0(i,j,k)-v0(i,j,km)) / dzh(k)+ & - (w0(i,j,k)-w0(i,jm,k)) *dyi )**2 + & - ((v0(i,jp,k)-v0(i,jp,km)) / dzh(k)+ & - (w0(i,jp,k)-w0(i,j,k)) *dyi )**2 + & - ((v0(i,jp,kp)-v0(i,jp,k)) / dzh(kp) + & - (w0(i,jp,kp)-w0(i,j,kp)) *dyi )**2 ) - end if + strain2 = ( & + ((u0(i+1,j,k)-u0(i,j,k)) *dxi )**2 + & + ((v0(i,j+1,k)-v0(i,j,k)) *dyi )**2 + & + ((w0(i,j,k+1)-w0(i,j,k)) /dzf(k) )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((w0(i,j,k+1)-w0(i-1,j,k+1)) *dxi + & + (u0(i,j,k+1)-u0(i,j,k)) /dzh(k+1) )**2 + & + ((w0(i,j,k)-w0(i-1,j,k)) *dxi + & + (u0(i,j,k)-u0(i,j,k-1)) /dzh(k) )**2 + & + ((w0(i+1,j,k)-w0(i,j,k)) *dxi + & + (u0(i+1,j,k)-u0(i+1,j,k-1)) /dzh(k) )**2 + & + ((w0(i+1,j,k+1)-w0(i,j,k+1)) *dxi + & + (u0(i+1,j,k+1)-u0(i+1,j,k)) /dzh(k+1) )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((u0(i,j+1,k)-u0(i,j,k)) *dyi + & + (v0(i,j+1,k)-v0(i-1,j+1,k)) *dxi )**2 + & + ((u0(i,j,k)-u0(i,j-1,k)) *dyi + & + (v0(i,j,k)-v0(i-1,j,k)) *dxi )**2 + & + ((u0(i+1,j,k)-u0(i+1,j-1,k)) *dyi + & + (v0(i+1,j,k)-v0(i,j,k)) *dxi )**2 + & + ((u0(i+1,j+1,k)-u0(i+1,j,k)) *dyi + & + (v0(i+1,j+1,k)-v0(i,j+1,k)) *dxi )**2 ) + + strain2 = strain2 + 0.125 * ( & + ((v0(i,j,k+1)-v0(i,j,k)) /dzh(k+1) + & + (w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi )**2 + & + ((v0(i,j,k)-v0(i,j,k-1)) /dzh(k) + & + (w0(i,j,k)-w0(i,j-1,k)) *dyi )**2 + & + ((v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k) + & + (w0(i,j+1,k)-w0(i,j,k)) *dyi )**2 + & + ((v0(i,j+1,k+1)-v0(i,j+1,k)) /dzh(k+1) + & + (w0(i,j+1,k+1)-w0(i,j,k+1)) *dyi )**2 ) ekm(i,j,k) = mlen ** 2. * sqrt(2. * strain2) ekh(i,j,k) = ekm(i,j,k) / Prandtl @@ -286,54 +333,103 @@ subroutine closure end do end do end do - + call timer_toc("Smagorinsky") ! do TKE scheme - else - do k=1,kmax - do j=2,j1 - do i=2,i1 - if (ldelta .or. (dthvdz(i,j,k)<=0)) then - zlt(i,j,k) = delta(k) - if (lmason) zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason) - ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) - ekh(i,j,k) = (ch1 + ch2) * ekm(i,j,k) - - ekm(i,j,k) = max(ekm(i,j,k),ekmin) - ekh(i,j,k) = max(ekh(i,j,k),ekmin) - else - ! zlt(i,j,k) = min(delta(k),cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k)))) - ! faster calculation: evaluate sqrt only if the second argument is actually smaller - zlt(i,j,k) = delta(k) - if ( grav*abs(dthvdz(i,j,k)) * delta(k)**2 > (cn*e120(i,j,k))**2 * thvf(k) ) then - zlt(i,j,k) = cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k))) - end if - - if (lmason) zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason) - - ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) - ekh(i,j,k) = (ch1 + ch2 * zlt(i,j,k)*deltai(k)) * ekm(i,j,k) - - ekm(i,j,k) = max(ekm(i,j,k),ekmin) - ekh(i,j,k) = max(ekh(i,j,k),ekmin) - endif - end do - end do - end do + else + ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment + if (ldelta .and. .not. lmason) then + !$acc parallel loop collapse(3) default(present) + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = delta(k) + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = ch * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + else if (ldelta .and. lmason) then ! delta scheme with Mason length scale correction + !$acc parallel loop collapse(3) default(present) + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = delta(k) + zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason) + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = ch * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + else if (lanisotrop) then ! Anisotropic diffusion, https://doi.org/10.1029/2022MS003095 + !$acc parallel loop collapse(3) default(present) + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = dzf(k) + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = ch * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + else ! Deardorff lengthscale correction + !$acc parallel loop collapse(3) default(present) + do k=1,kmax + do j=2,j1 + do i=2,i1 + zlt(i,j,k) = delta(k) + + !original + !if (dthvdz(i,j,k) > 0) then + !zlt(i,j,k) = min(delta(k),cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k)))) + !end if + + ! alternative without if + zlt(i,j,k) = min(delta(k), & + cn*e120(i,j,k) / sqrt( grav/thvf(k) * abs(dthvdz(i,j,k))) + & + delta(k) * (1.0-sign(1.0_field_r,dthvdz(i,j,k)))) + ! the final line is 0 if dthvdz(i,j,k) > 0, else 2*delta(k) + ! ensuring that zlt(i,j,k) = delta(k) when dthvdz < 0, as + ! in the original scheme. + + ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k) + ekh(i,j,k) = (ch1 + ch2 * zlt(i,j,k)*deltai(k)) * ekm(i,j,k) + + ekm(i,j,k) = max(ekm(i,j,k),ekmin) + ekh(i,j,k) = max(ekh(i,j,k),ekmin) + end do + end do + end do + end if end if - !************************************************************* ! Set cyclic boundary condition for K-closure factors. !************************************************************* - + call timer_tic("Boundary conditions", 2) call excjs( ekm , 2,i1,2,j1,1,k1,ih,jh) call excjs( ekh , 2,i1,2,j1,1,k1,ih,jh) + call timer_toc("Boundary conditions") + call timer_tic("Write buffer", 3) + !$acc parallel loop collapse(2) default(present) do j=1,j2 do i=1,i2 ekm(i,j,k1) = ekm(i,j,kmax) ekh(i,j,k1) = ekh(i,j,kmax) end do end do + call timer_toc("Write buffer") return end subroutine closure @@ -369,58 +465,52 @@ subroutine sources implicit none real tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv - integer i,j,k,jm,jp,km,kp - + integer i,j,k + !$acc parallel loop collapse(3) default(present) private(tdef2) do k=2,kmax - do j=2,j1 - do i=2,i1 - kp=k+1 - km=k-1 - jp=j+1 - jm=j-1 - - tdef2 = 2. * ( & - ((u0(i+1,j,k)-u0(i,j,k)) /dx )**2 + & - ((v0(i,jp,k)-v0(i,j,k)) /dy )**2 + & - ((w0(i,j,kp)-w0(i,j,k)) /dzf(k) )**2 ) - - tdef2 = tdef2 + 0.25 * ( & - ((w0(i,j,kp)-w0(i-1,j,kp)) / dx + & - (u0(i,j,kp)-u0(i,j,k)) / dzh(kp) )**2 + & - ((w0(i,j,k)-w0(i-1,j,k)) / dx + & - (u0(i,j,k)-u0(i,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,k)-w0(i,j,k)) / dx + & - (u0(i+1,j,k)-u0(i+1,j,km)) / dzh(k) )**2 + & - ((w0(i+1,j,kp)-w0(i,j,kp)) / dx + & - (u0(i+1,j,kp)-u0(i+1,j,k)) / dzh(kp) )**2 ) - - tdef2 = tdef2 + 0.25 * ( & - ((u0(i,jp,k)-u0(i,j,k)) / dy + & - (v0(i,jp,k)-v0(i-1,jp,k)) / dx )**2 + & - ((u0(i,j,k)-u0(i,jm,k)) / dy + & - (v0(i,j,k)-v0(i-1,j,k)) / dx )**2 + & - ((u0(i+1,j,k)-u0(i+1,jm,k)) / dy + & - (v0(i+1,j,k)-v0(i,j,k)) / dx )**2 + & - ((u0(i+1,jp,k)-u0(i+1,j,k)) / dy + & - (v0(i+1,jp,k)-v0(i,jp,k)) / dx )**2 ) - - tdef2 = tdef2 + 0.25 * ( & - ((v0(i,j,kp)-v0(i,j,k)) / dzh(kp) + & - (w0(i,j,kp)-w0(i,jm,kp)) / dy )**2 + & - ((v0(i,j,k)-v0(i,j,km)) / dzh(k)+ & - (w0(i,j,k)-w0(i,jm,k)) / dy )**2 + & - ((v0(i,jp,k)-v0(i,jp,km)) / dzh(k)+ & - (w0(i,jp,k)-w0(i,j,k)) / dy )**2 + & - ((v0(i,jp,kp)-v0(i,jp,k)) / dzh(kp) + & - (w0(i,jp,kp)-w0(i,j,kp)) / dy )**2 ) - - e12p(i,j,k) = e12p(i,j,k) & - + (ekm(i,j,k)*tdef2 - ekh(i,j,k)*grav/thvf(k)*dthvdz(i,j,k) ) / (2*e120(i,j,k)) & ! sbshr and sbbuo - - (ce1 + ce2*zlt(i,j,k)*deltai(k)) * e120(i,j,k)**2 /(2.*zlt(i,j,k)) ! sbdiss - - end do - end do + do j=2,j1 + do i=2,i1 + tdef2 = 2. * ( & + ((u0(i+1,j,k)-u0(i,j,k)) /dx )**2 + & + ((v0(i,j+1,k)-v0(i,j,k)) /dy )**2 + & + ((w0(i,j,k+1)-w0(i,j,k)) /dzf(k) )**2 ) + + tdef2 = tdef2 + 0.25 * ( & + ((w0(i,j,k+1)-w0(i-1,j,k+1)) / dx + & + (u0(i,j,k+1)-u0(i,j,k)) / dzh(k+1) )**2 + & + ((w0(i,j,k)-w0(i-1,j,k)) / dx + & + (u0(i,j,k)-u0(i,j,k-1)) / dzh(k) )**2 + & + ((w0(i+1,j,k)-w0(i,j,k)) / dx + & + (u0(i+1,j,k)-u0(i+1,j,k-1)) / dzh(k) )**2 + & + ((w0(i+1,j,k+1)-w0(i,j,k+1)) / dx + & + (u0(i+1,j,k+1)-u0(i+1,j,k)) / dzh(k+1) )**2 ) + + tdef2 = tdef2 + 0.25 * ( & + ((u0(i,j+1,k)-u0(i,j,k)) / dy + & + (v0(i,j+1,k)-v0(i-1,j+1,k)) / dx )**2 + & + ((u0(i,j,k)-u0(i,j-1,k)) / dy + & + (v0(i,j,k)-v0(i-1,j,k)) / dx )**2 + & + ((u0(i+1,j,k)-u0(i+1,j-1,k)) / dy + & + (v0(i+1,j,k)-v0(i,j,k)) / dx )**2 + & + ((u0(i+1,j+1,k)-u0(i+1,j,k)) / dy + & + (v0(i+1,j+1,k)-v0(i,j+1,k)) / dx )**2 ) + + tdef2 = tdef2 + 0.25 * ( & + ((v0(i,j,k+1)-v0(i,j,k)) / dzh(k+1) + & + (w0(i,j,k+1)-w0(i,j-1,k+1)) / dy )**2 + & + ((v0(i,j,k)-v0(i,j,k-1)) / dzh(k) + & + (w0(i,j,k)-w0(i,j-1,k)) / dy )**2 + & + ((v0(i,j+1,k)-v0(i,j+1,k-1)) / dzh(k) + & + (w0(i,j+1,k)-w0(i,j,k)) / dy )**2 + & + ((v0(i,j+1,k+1)-v0(i,j+1,k)) / dzh(k+1) + & + (w0(i,j+1,k+1)-w0(i,j,k+1)) / dy )**2 ) + + e12p(i,j,k) = e12p(i,j,k) & + + (ekm(i,j,k)*tdef2 - ekh(i,j,k)*grav/thvf(k)*dthvdz(i,j,k) ) / (2*e120(i,j,k)) & ! sbshr and sbbuo + - (ce1 + ce2*zlt(i,j,k)*deltai(k)) * e120(i,j,k)**2 /(2.*zlt(i,j,k)) ! sbdiss + end do + end do end do ! ----------------------------------------------end i,j,k-loop @@ -428,80 +518,155 @@ subroutine sources ! special treatment for lowest full level: k=1 ! -------------------------------------------- - - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - - - -! ** Calculate "shear" production term: tdef2 **************** - - tdef2 = 2. * ( & - ((u0(i+1,j,1)-u0(i,j,1))*dxi)**2 & - + ((v0(i,jp,1)-v0(i,j,1))*dyi)**2 & - + ((w0(i,j,2)-w0(i,j,1))/dzf(1))**2 ) - - if (sgs_surface_fix) then - ! Use known surface flux and exchange coefficient to derive - ! consistent gradient (such that correct flux will occur in - ! shear production term) - ! Make sure that no division by zero occurs in determination of the - ! directional component; ekm should already be >= ekmin - ! Replace the dudz by surface flux -uw / ekm - horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) - uwflux = -ustar(i,j)*ustar(i,j)* ((u0(i,j,1)+cu)/horv) - local_dudz = -uwflux / ekm(i,j,1) - tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & - local_dudz )**2 - else - tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & - dudz(i,j) )**2 - endif - - tdef2 = tdef2 + 0.25 *( & - ((u0(i,jp,1)-u0(i,j,1))*dyi+(v0(i,jp,1)-v0(i-1,jp,1))*dxi)**2 & - +((u0(i,j,1)-u0(i,jm,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & - +((u0(i+1,j,1)-u0(i+1,jm,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & - +((u0(i+1,jp,1)-u0(i+1,j,1))*dyi+ & - (v0(i+1,jp,1)-v0(i,jp,1))*dxi)**2 ) - - if (sgs_surface_fix) then - ! Use known surface flux and exchange coefficient to derive - ! consistent gradient (such that correct flux will occur in - ! shear production term) - ! Make sure that no division by zero occurs in determination of the - ! directional component; ekm should already be >= ekmin - ! Replace the dvdz by surface flux -vw / ekm - horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) - vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv) - local_dvdz = -vwflux / ekm(i,j,1) - tdef2 = tdef2 + ( 0.25*(w0(i,jp,2)-w0(i,jm,2))*dyi + & - local_dvdz )**2 - else - tdef2 = tdef2 + ( 0.25*(w0(i,jp,2)-w0(i,jm,2))*dyi + & + if (sgs_surface_fix) then + !$acc parallel loop collapse(2) default(present) private(tdef2,horv,uwflux,vwflux,local_dudz,local_dvdz,local_dthvdz) + do j=2,j1 + do i=2,i1 + tdef2 = 2. * ( & + ((u0(i+1,j,1)-u0(i,j,1)) *dxi )**2 & + + ((v0(i,j+1,1)-v0(i,j,1)) *dyi )**2 & + + ((w0(i,j,2)-w0(i,j,1)) /dzf(1) )**2 ) + ! Use known surface flux and exchange coefficient to derive + ! consistent gradient (such that correct flux will occur in + ! shear production term) + ! Make sure that no division by zero occurs in determination of the + ! directional component; ekm should already be >= ekmin + ! Replace the dudz by surface flux -uw / ekm + horv = max(sqrt((u0(i,j,1)+cu)**2 + (v0(i,j,1)+cv)**2), 0.01) + uwflux = -ustar(i,j)*ustar(i,j) * ((u0(i,j,1)+cu)/horv) + local_dudz = -uwflux / ekm(i,j,1) + + tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & + local_dudz )**2 + + tdef2 = tdef2 + 0.25 *( & + ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 & + +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & + +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & + +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi & + +(v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2 ) + + ! Use known surface flux and exchange coefficient to derive + ! consistent gradient (such that correct flux will occur in + ! shear production term) + ! Make sure that no division by zero occurs in determination of the + ! directional component; ekm should already be >= ekmin + ! Replace the dvdz by surface flux -vw / ekm + vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv) + local_dvdz = -vwflux / ekm(i,j,1) + tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & + local_dvdz )**2 + + ! ** Include shear and buoyancy production terms and dissipation ** + sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) + + ! Replace the -ekh * dthvdz by the surface flux of thv + ! (but we only have the thlflux , which seems at the surface to be + ! equivalent + local_dthvdz = -thlflux(i,j)/ekh(i,j,1) + sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1)) + sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) + end do + end do + else + !$acc parallel loop collapse(2) default(present) private(tdef2) + do j=2,j1 + do i=2,i1 + tdef2 = 2. * ( & + ((u0(i+1,j,1)-u0(i,j,1)) *dxi )**2 & + + ((v0(i,j+1,1)-v0(i,j,1)) *dyi )**2 & + + ((w0(i,j,2)-w0(i,j,1)) /dzf(1) )**2 ) + + tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & + dudz(i,j) )**2 + + tdef2 = tdef2 + 0.25 *( & + ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 & + +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & + +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & + +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi & + +(v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2 ) + + tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & dvdz(i,j) )**2 - endif -! ** Include shear and buoyancy production terms and dissipation ** + sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) + sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1)) + sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) + end do + end do + endif - sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) - if (sgs_surface_fix) then - ! Replace the -ekh * dthvdz by the surface flux of thv - ! (but we only have the thlflux , which seems at the surface to be - ! equivalent - local_dthvdz = -thlflux(i,j)/ekh(i,j,1) - sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1)) - else - sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1)) - endif - sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) - end do - end do +! do j=2,j1 +! do i=2,i1 +! ! ** Calculate "shear" production term: tdef2 **************** +! tdef2 = 2. * ( & +! ((u0(i+1,j,1)-u0(i,j,1))*dxi)**2 & +! + ((v0(i,j+1,1)-v0(i,j,1))*dyi)**2 & +! + ((w0(i,j,2)-w0(i,j,1))/dzf(1))**2 ) +! +! if (sgs_surface_fix) then +! ! Use known surface flux and exchange coefficient to derive +! ! consistent gradient (such that correct flux will occur in +! ! shear production term) +! ! Make sure that no division by zero occurs in determination of the +! ! directional component; ekm should already be >= ekmin +! ! Replace the dudz by surface flux -uw / ekm +! horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) +! uwflux = -ustar(i,j)*ustar(i,j)* ((u0(i,j,1)+cu)/horv) +! local_dudz = -uwflux / ekm(i,j,1) +! tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & +! local_dudz )**2 +! else +! tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + & +! dudz(i,j) )**2 +! endif +! +! tdef2 = tdef2 + 0.25 *( & +! ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 & +! +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 & +! +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 & +! +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi+ & +! (v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2 ) +! +! if (sgs_surface_fix) then +! ! Use known surface flux and exchange coefficient to derive +! ! consistent gradient (such that correct flux will occur in +! ! shear production term) +! ! Make sure that no division by zero occurs in determination of the +! ! directional component; ekm should already be >= ekmin +! ! Replace the dvdz by surface flux -vw / ekm +! horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2), 0.01) +! vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv) +! local_dvdz = -vwflux / ekm(i,j,1) +! tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & +! local_dvdz )**2 +! else +! tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + & +! dvdz(i,j) )**2 +! endif +! +!! ** Include shear and buoyancy production terms and dissipation ** +! +! sbshr(i,j,1) = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1)) +! if (sgs_surface_fix) then +! ! Replace the -ekh * dthvdz by the surface flux of thv +! ! (but we only have the thlflux , which seems at the surface to be +! ! equivalent +! local_dthvdz = -thlflux(i,j)/ekh(i,j,1) +! sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1)) +! else +! sbbuo(i,j,1) = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1)) +! endif +! sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1)) +! end do +! end do + + !$acc kernels default(present) e12p(2:i1,2:j1,1) = e12p(2:i1,2:j1,1) + & sbshr(2:i1,2:j1,1)+sbbuo(2:i1,2:j1,1)+sbdiss(2:i1,2:j1,1) + !$acc end kernels return end subroutine sources @@ -516,46 +681,42 @@ subroutine diffc (a_in,a_out,flux) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real, intent(in) :: flux (i2,j2) - integer i,j,k,jm,jp,km,kp - + integer i,j,k + + !$acc parallel loop collapse(3) default(present) do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 a_out(i,j,k) = a_out(i,j,k) & + 0.5 * ( & ( (ekh(i+1,j,k)+ekh(i,j,k))*(a_in(i+1,j,k)-a_in(i,j,k)) & - -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i & + -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i * anis_fac(k) & + & - ( (ekh(i,jp,k)+ekh(i,j,k)) *(a_in(i,jp,k)-a_in(i,j,k)) & - -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i & + ( (ekh(i,j+1,k)+ekh(i,j,k)) *(a_in(i,j+1,k)-a_in(i,j,k)) & + -(ekh(i,j,k)+ekh(i,j-1,k)) *(a_in(i,j,k)-a_in(i,j-1,k)) )*dy2i * anis_fac(k) & + & - ( rhobh(kp)/rhobf(k) * (dzf(kp)*ekh(i,j,k) + dzf(k)*ekh(i,j,kp)) & - * (a_in(i,j,kp)-a_in(i,j,k)) / dzh(kp)**2 & + ( rhobh(k+1)/rhobf(k) * (dzf(k+1)*ekh(i,j,k) + dzf(k)*ekh(i,j,k+1)) & + * (a_in(i,j,k+1)-a_in(i,j,k)) / dzh(k+1)**2 & - & - rhobh(k)/rhobf(k) * (dzf(km)*ekh(i,j,k) + dzf(k)*ekh(i,j,km)) & - * (a_in(i,j,k)-a_in(i,j,km)) / dzh(k)**2 )/dzf(k) & + rhobh(k)/rhobf(k) * (dzf(k-1)*ekh(i,j,k) + dzf(k)*ekh(i,j,k-1)) & + * (a_in(i,j,k)-a_in(i,j,k-1)) / dzh(k)**2 )/dzf(k) & ) end do end do end do - + + !$acc parallel loop collapse(2) default(present) do j=2,j1 do i=2,i1 a_out(i,j,1) = a_out(i,j,1) & + 0.5 * ( & ( (ekh(i+1,j,1)+ekh(i,j,1))*(a_in(i+1,j,1)-a_in(i,j,1)) & - -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i & + -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(1) & + & ( (ekh(i,j+1,1)+ekh(i,j,1))*(a_in(i,j+1,1)-a_in(i,j,1)) & - -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i & + -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * (dzf(2)*ekh(i,j,1) + dzf(1)*ekh(i,j,2)) & * (a_in(i,j,2)-a_in(i,j,1)) / dzh(2)**2 & @@ -576,30 +737,25 @@ subroutine diffe(a_out) implicit none real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) - integer :: i,j,k,jm,jp,km,kp - + integer :: i,j,k + + !$acc parallel loop collapse(3) default(present) do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 a_out(i,j,k) = a_out(i,j,k) & + ( & ((ekm(i+1,j,k)+ekm(i,j,k))*(e120(i+1,j,k)-e120(i,j,k)) & - -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i & + -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i * anis_fac(k) & + & - ((ekm(i,jp,k)+ekm(i,j,k)) *(e120(i,jp,k)-e120(i,j,k)) & - -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i & + ((ekm(i,j+1,k)+ekm(i,j,k)) *(e120(i,j+1,k)-e120(i,j,k)) & + -(ekm(i,j,k)+ekm(i,j-1,k)) *(e120(i,j,k)-e120(i,j-1,k)) )*dy2i * anis_fac(k) & + & - (rhobh(kp)/rhobf(k) * (dzf(kp)*ekm(i,j,k) + dzf(k)*ekm(i,j,kp)) & - *(e120(i,j,kp)-e120(i,j,k)) / dzh(kp)**2 & - - rhobh(k)/rhobf(k) * (dzf(km)*ekm(i,j,k) + dzf(k)*ekm(i,j,km)) & - *(e120(i,j,k)-e120(i,j,km)) / dzh(k)**2 )/dzf(k) & + (rhobh(k+1)/rhobf(k) * (dzf(k+1)*ekm(i,j,k) + dzf(k)*ekm(i,j,k+1)) & + *(e120(i,j,k+1)-e120(i,j,k)) / dzh(k+1)**2 & + - rhobh(k)/rhobf(k) * (dzf(k-1)*ekm(i,j,k) + dzf(k)*ekm(i,j,k-1)) & + *(e120(i,j,k)-e120(i,j,k-1)) / dzh(k)**2 )/dzf(k) & ) end do @@ -610,15 +766,16 @@ subroutine diffe(a_out) ! special treatment for lowest full level: k=1 ! -------------------------------------------- + !$acc parallel loop collapse(2) do j=2,j1 do i=2,i1 a_out(i,j,1) = a_out(i,j,1) + & ( (ekm(i+1,j,1)+ekm(i,j,1))*(e120(i+1,j,1)-e120(i,j,1)) & - -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i & + -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(1) & + & ( (ekm(i,j+1,1)+ekm(i,j,1))*(e120(i,j+1,1)-e120(i,j,1)) & - -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i & + -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * (dzf(2)*ekm(i,j,1) + dzf(1)*ekm(i,j,2)) & * (e120(i,j,2)-e120(i,j,1)) / dzh(2)**2 )/dzf(1) @@ -640,46 +797,41 @@ subroutine diffu (a_out) real :: emmo,emom,emop,empo real :: fu real :: ucu, upcu - integer :: i,j,k,jm,jp,km,kp + integer :: i,j,k + !$acc parallel loop collapse(3) default(present) private(emom, emop, empo, emmo) do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - emom = ( dzf(km) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i-1,j,km) ) ) / & + emom = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i-1,j,k-1) ) ) / & ( 4. * dzh(k) ) - emop = ( dzf(kp) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & - dzf(k) * ( ekm(i,j,kp) + ekm(i-1,j,kp) ) ) / & - ( 4. * dzh(kp) ) + emop = ( dzf(k+1) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & + dzf(k) * ( ekm(i,j,k+1) + ekm(i-1,j,k+1) ) ) / & + ( 4. * dzh(k+1) ) empo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jp,k)+ekm(i-1,jp,k)+ekm(i-1,j,k) ) + ekm(i,j,k)+ekm(i,j+1,k)+ekm(i-1,j+1,k)+ekm(i-1,j,k) ) emmo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jm,k)+ekm(i-1,jm,k)+ekm(i-1,j,k) ) + ekm(i,j,k)+ekm(i,j-1,k)+ekm(i-1,j-1,k)+ekm(i-1,j,k) ) a_out(i,j,k) = a_out(i,j,k) & + & ( ekm(i,j,k) * (u0(i+1,j,k)-u0(i,j,k)) & - -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i & + -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) & + & - ( empo * ( (u0(i,jp,k)-u0(i,j,k)) *dyi & - +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) & - -emmo * ( (u0(i,j,k)-u0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) / dy & + ( empo * ( (u0(i,j+1,k)-u0(i,j,k)) *dyi & + +(v0(i,j+1,k)-v0(i-1,j+1,k))*dxi) & + -emmo * ( (u0(i,j,k)-u0(i,j-1,k)) *dyi & + +(v0(i,j,k)-v0(i-1,j,k)) *dxi) ) / dy * anis_fac(k) & + & - ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k)) /dzh(kp) & - +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) & - - rhobh(k)/rhobf(k) * emom * ( (u0(i,j,k)-u0(i,j,km)) /dzh(k) & + ( rhobh(k+1)/rhobf(k) * emop * ( (u0(i,j,k+1)-u0(i,j,k)) /dzh(k+1) & + +(w0(i,j,k+1)-w0(i-1,j,k+1))*dxi) & + - rhobh(k)/rhobf(k) * emom * ( (u0(i,j,k)-u0(i,j,k-1)) /dzh(k) & +(w0(i,j,k)-w0(i-1,j,k)) *dxi) ) /dzf(k) end do @@ -690,17 +842,15 @@ subroutine diffu (a_out) ! special treatment for lowest full level: k=1 ! -------------------------------------------- + !$acc parallel loop collapse(2) default(present) private(empo, emmo, emop, ucu, upcu, fu) do j=2,j1 - jp = j+1 - jm = j-1 - do i=2,i1 empo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jp,1)+ekm(i-1,jp,1)+ekm(i-1,j,1) ) + ekm(i,j,1)+ekm(i,j+1,1)+ekm(i-1,j+1,1)+ekm(i-1,j,1) ) emmo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jm,1)+ekm(i-1,jm,1)+ekm(i-1,j,1) ) + ekm(i,j,1)+ekm(i,j-1,1)+ekm(i-1,j-1,1)+ekm(i-1,j,1) ) emop = ( dzf(2) * ( ekm(i,j,1) + ekm(i-1,j,1) ) + & dzf(1) * ( ekm(i,j,2) + ekm(i-1,j,2) ) ) / & @@ -709,26 +859,29 @@ subroutine diffu (a_out) ucu = 0.5*(u0(i,j,1)+u0(i+1,j,1))+cu - if(ucu >= 0.) then - upcu = max(ucu,1.e-10) - else - upcu = min(ucu,-1.e-10) - end if - + !if(ucu >= 0.) then + ! upcu = max(ucu,1.e-10) + !else + ! upcu = min(ucu,-1.e-10) + !end if + + ! Branchless version of the algorithm above, + ! may or may not be more efficient + upcu = sign(1.,ucu) * max(abs(ucu),1.e-10) fu = ( 0.5*( ustar(i,j)+ustar(i-1,j) ) )**2 * & upcu/sqrt(upcu**2 + & - ((v0(i,j,1)+v0(i-1,j,1)+v0(i,jp,1)+v0(i-1,jp,1))/4.+cv)**2) + ((v0(i,j,1)+v0(i-1,j,1)+v0(i,j+1,1)+v0(i-1,j+1,1))/4.+cv)**2) a_out(i,j,1) = a_out(i,j,1) & + & ( ekm(i,j,1) * (u0(i+1,j,1)-u0(i,j,1)) & - -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i & + -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(1) & + & - ( empo * ( (u0(i,jp,1)-u0(i,j,1)) *dyi & - +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) & - -emmo * ( (u0(i,j,1)-u0(i,jm,1)) *dyi & - +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy & + ( empo * ( (u0(i,j+1,1)-u0(i,j,1)) *dyi & + +(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi) & + -emmo * ( (u0(i,j,1)-u0(i,j-1,1)) *dyi & + +(v0(i,j,1)-v0(i-1,j,1)) *dxi) ) / dy * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1)) /dzh(2) & +(w0(i,j,2)-w0(i-1,j,2)) *dxi) & @@ -751,47 +904,42 @@ subroutine diffv (a_out) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real :: emmo, eomm,eomp,epmo real :: fv, vcv,vpcv - integer :: i,j,k,jm,jp,km,kp + integer :: i,j,k + !$acc parallel loop collapse(3) default(present) private(eomm, eomp, emmo, epmo) do k=2,kmax - kp=k+1 - km=k-1 - do j=2,j1 - jp=j+1 - jm=j-1 - do i=2,i1 - eomm = ( dzf(km) * ( ekm(i,j,k) + ekm(i,jm,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i,jm,km) ) ) / & + eomm = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i,j-1,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i,j-1,k-1) ) ) / & ( 4. * dzh(k) ) - eomp = ( dzf(kp) * ( ekm(i,j,k) + ekm(i,jm,k) ) + & - dzf(k) * ( ekm(i,j,kp) + ekm(i,jm,kp) ) ) / & - ( 4. * dzh(kp) ) + eomp = ( dzf(k+1) * ( ekm(i,j,k) + ekm(i,j-1,k) ) + & + dzf(k) * ( ekm(i,j,k+1) + ekm(i,j-1,k+1) ) ) / & + ( 4. * dzh(k+1) ) emmo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jm,k)+ekm(i-1,jm,k)+ekm(i-1,j,k) ) + ekm(i,j,k)+ekm(i,j-1,k)+ekm(i-1,j-1,k)+ekm(i-1,j,k) ) epmo = 0.25 * ( & - ekm(i,j,k)+ekm(i,jm,k)+ekm(i+1,jm,k)+ekm(i+1,j,k) ) + ekm(i,j,k)+ekm(i,j-1,k)+ekm(i+1,j-1,k)+ekm(i+1,j,k) ) a_out(i,j,k) = a_out(i,j,k) & + & ( epmo * ( (v0(i+1,j,k)-v0(i,j,k)) *dxi & - +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) & + +(u0(i+1,j,k)-u0(i+1,j-1,k))*dyi) & -emmo * ( (v0(i,j,k)-v0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,jm,k)) *dyi) ) / dx & + +(u0(i,j,k)-u0(i,j-1,k)) *dyi) ) / dx * anis_fac(k) & + & - (ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) & - -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k)) ) * 2. * dy2i & + (ekm(i,j,k) * (v0(i,j+1,k)-v0(i,j,k)) & + -ekm(i,j-1,k)* (v0(i,j,k)-v0(i,j-1,k)) ) * 2. * dy2i * anis_fac(k) & + & - ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k)) /dzh(kp) & - +(w0(i,j,kp)-w0(i,jm,kp)) *dyi) & - - rhobh(k)/rhobf(k) * eomm * ( (v0(i,j,k)-v0(i,j,km)) /dzh(k) & - +(w0(i,j,k)-w0(i,jm,k)) *dyi) ) / dzf(k) + ( rhobh(k+1)/rhobf(k) * eomp * ( (v0(i,j,k+1)-v0(i,j,k)) /dzh(k+1) & + +(w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi) & + - rhobh(k)/rhobf(k) * eomm * ( (v0(i,j,k)-v0(i,j,k-1)) /dzh(k) & + +(w0(i,j,k)-w0(i,j-1,k)) *dyi) ) / dzf(k) end do end do @@ -801,45 +949,47 @@ subroutine diffv (a_out) ! special treatment for lowest full level: k=1 ! -------------------------------------------- + !$acc parallel loop collapse(2) default(present) private(emmo, epmo, eomp, vcv, vpcv, fv) do j=2,j1 - jp = j+1 - jm = j-1 do i=2,i1 emmo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jm,1)+ekm(i-1,jm,1)+ekm(i-1,j,1) ) + ekm(i,j,1)+ekm(i,j-1,1)+ekm(i-1,j-1,1)+ekm(i-1,j,1) ) epmo = 0.25 * ( & - ekm(i,j,1)+ekm(i,jm,1)+ekm(i+1,jm,1)+ekm(i+1,j,1) ) + ekm(i,j,1)+ekm(i,j-1,1)+ekm(i+1,j-1,1)+ekm(i+1,j,1) ) - eomp = ( dzf(2) * ( ekm(i,j,1) + ekm(i,jm,1) ) + & - dzf(1) * ( ekm(i,j,2) + ekm(i,jm,2) ) ) / & + eomp = ( dzf(2) * ( ekm(i,j,1) + ekm(i,j-1,1) ) + & + dzf(1) * ( ekm(i,j,2) + ekm(i,j-1,2) ) ) / & ( 4. * dzh(2) ) vcv = 0.5*(v0(i,j,1)+v0(i,j+1,1))+cv - if(vcv >= 0.) then - vpcv = max(vcv,1.e-10) - else - vpcv = min(vcv,-1.e-10) - end if + + !if(vcv >= 0.) then + ! vpcv = max(vcv,1.e-10) + !else + ! vpcv = min(vcv,-1.e-10) + !end if + + vpcv = sign(1., vcv) * max(abs(vcv),1.e-10) fv = ( 0.5*( ustar(i,j)+ustar(i,j-1) ) )**2 * & vpcv/sqrt(vpcv**2 + & - ((u0(i,j,1)+u0(i+1,j,1)+u0(i,jm,1)+u0(i+1,jm,1))/4.+cu)**2) + ((u0(i,j,1)+u0(i+1,j,1)+u0(i,j-1,1)+u0(i+1,j-1,1))/4.+cu)**2) a_out(i,j,1) = a_out(i,j,1) & + & ( epmo * ( (v0(i+1,j,1)-v0(i,j,1)) *dxi & - +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) & + +(u0(i+1,j,1)-u0(i+1,j-1,1))*dyi) & -emmo * ( (v0(i,j,1)-v0(i-1,j,1)) *dxi & - +(u0(i,j,1)-u0(i,jm,1)) *dyi) ) / dx & + +(u0(i,j,1)-u0(i,j-1,1)) *dyi) ) / dx * anis_fac(1) & + & - ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) & - -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1)) ) * 2. * dy2i & + ( ekm(i,j,1) * (v0(i,j+1,1)-v0(i,j,1)) & + -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1)) ) * 2. * dy2i * anis_fac(1) & + & ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1)) /dzh(2) & - +(w0(i,j,2)-w0(i,jm,2)) *dyi) & + +(w0(i,j,2)-w0(i,j-1,2)) *dyi) & -rhobh(1)/rhobf(1)*fv ) / dzf(1) end do @@ -859,47 +1009,44 @@ subroutine diffw(a_out) real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1) real :: emom, eomm, eopm, epom - integer :: i,j,k,jm,jp,km,kp - + integer :: i,j,k + + !$acc parallel loop collapse(3) default(present) private(emom, eomm, eopm, epom) do k=2,kmax - kp=k+1 - km=k-1 do j=2,j1 - jp=j+1 - jm=j-1 do i=2,i1 - emom = ( dzf(km) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i-1,j,km) ) ) / & + emom = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i-1,j,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i-1,j,k-1) ) ) / & ( 4. * dzh(k) ) - eomm = ( dzf(km) * ( ekm(i,j,k) + ekm(i,jm,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i,jm,km) ) ) / & + eomm = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i,j-1,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i,j-1,k-1) ) ) / & ( 4. * dzh(k) ) - eopm = ( dzf(km) * ( ekm(i,j,k) + ekm(i,jp,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i,jp,km) ) ) / & + eopm = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i,j+1,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i,j+1,k-1) ) ) / & ( 4. * dzh(k) ) - epom = ( dzf(km) * ( ekm(i,j,k) + ekm(i+1,j,k) ) + & - dzf(k) * ( ekm(i,j,km) + ekm(i+1,j,km) ) ) / & + epom = ( dzf(k-1) * ( ekm(i,j,k) + ekm(i+1,j,k) ) + & + dzf(k) * ( ekm(i,j,k-1) + ekm(i+1,j,k-1) ) ) / & ( 4. * dzh(k) ) a_out(i,j,k) = a_out(i,j,k) & + & ( epom * ( (w0(i+1,j,k)-w0(i,j,k)) *dxi & - +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) & + +(u0(i+1,j,k)-u0(i+1,j,k-1)) /dzh(k) ) & -emom * ( (w0(i,j,k)-w0(i-1,j,k)) *dxi & - +(u0(i,j,k)-u0(i,j,km)) /dzh(k) ))/dx & + +(u0(i,j,k)-u0(i,j,k-1)) /dzh(k) ))/dx * anis_fac(k) & + & - ( eopm * ( (w0(i,jp,k)-w0(i,j,k)) *dyi & - +(v0(i,jp,k)-v0(i,jp,km)) /dzh(k) ) & - -eomm * ( (w0(i,j,k)-w0(i,jm,k)) *dyi & - +(v0(i,j,k)-v0(i,j,km)) /dzh(k) ))/dy & + ( eopm * ( (w0(i,j+1,k)-w0(i,j,k)) *dyi & + +(v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k) ) & + -eomm * ( (w0(i,j,k)-w0(i,j-1,k)) *dyi & + +(v0(i,j,k)-v0(i,j,k-1)) /dzh(k) ))/dy * anis_fac(k) & + (1./rhobh(k))*& - ( rhobf(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) & - - rhobf(km) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. & + ( rhobf(k) * ekm(i,j,k) * (w0(i,j,k+1)-w0(i,j,k)) /dzf(k) & + - rhobf(k-1) * ekm(i,j,k-1)* (w0(i,j,k)-w0(i,j,k-1)) /dzf(k-1) ) * 2. & / dzh(k) end do diff --git a/src/modsubgriddata.f90 b/src/modsubgriddata.f90 index 81569c36..8f8ad4fb 100644 --- a/src/modsubgriddata.f90 +++ b/src/modsubgriddata.f90 @@ -35,6 +35,8 @@ module modsubgriddata logical :: ldelta = .false. !< switch for subgrid length formulation (on/off) logical :: lmason = .false. !< switch for decreased length scale near the surface logical :: lsmagorinsky = .false. !< switch for smagorinsky subgrid scheme + logical :: lanisotrop = .false. !< switch for anisotropic diffusion + real :: cf = 2.5 !< filter constant real :: Rigc = 0.25 !< critical Richardson number real :: Prandtl = (1.0/3.0) @@ -42,6 +44,7 @@ module modsubgriddata real :: cn = 0.76 real :: ch1 = 1. real :: ch2 = 2. + real :: ch ! ch = ch1 + ch2 real :: ce1 = 0.19 real :: ce2 = 0.51 real :: cs = -1 @@ -60,5 +63,6 @@ module modsubgriddata real, allocatable :: csz(:) !< Smagorinsky constant + real, allocatable :: anis_fac(:) !< grid anisotropy factor end module modsubgriddata