From 47ac7fa6fca4dc35e0de0853140ce7e30faca0bb Mon Sep 17 00:00:00 2001
From: Fredrik Jansson <fjansson@abo.fi>
Date: Thu, 8 Jun 2023 14:40:16 +0200
Subject: [PATCH 01/19] modforces: forces and lstend cleanup and optimization

i,j,n loops as vector expressions. Avoids short inner loop over n.
Remove data dependencies for better automatic OpenMP parallelization with Fujitsu compiler.
In lstend, merge k=1 and k>1 cases.
---
 src/modforces.f90 | 154 +++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 111 deletions(-)

diff --git a/src/modforces.f90 b/src/modforces.f90
index 93d33f4e0..09bbe3226 100644
--- a/src/modforces.f90
+++ b/src/modforces.f90
@@ -63,69 +63,36 @@ subroutine forces
   use modglobal, only : i1,j1,kmax,dzh,dzf,grav, lpressgrad
   use modfields, only : sv0,up,vp,wp,thv0h,dpdxl,dpdyl,thvh
   use moduser,   only : force_user
-  use modmicrodata, only : imicro, imicro_bulk, imicro_bin, imicro_sice,iqr
+  use modmicrodata, only : imicro, imicro_bulk, imicro_bin, imicro_sice, imicro_sice2, iqr
   implicit none
 
-  integer i, j, k, jm, jp, km, kp
+  integer i, j, k
 
   if (lforce_user) call force_user
 
-  if((imicro==imicro_sice).or.(imicro==imicro_bulk).or.(imicro==imicro_bin)) then
+  if (lpressgrad) then
+     do k=1,kmax
+        up(:,:,k) = up(:,:,k) - dpdxl(k)      !RN LS pressure gradient force in x,y directions;
+        vp(:,:,k) = vp(:,:,k) - dpdyl(k)
+     end do
+  end if
+
+  if((imicro==imicro_sice).or.(imicro==imicro_sice2).or.(imicro==imicro_bulk).or.(imicro==imicro_bin)) then
     do k=2,kmax
-      kp=k+1
-      km=k-1
-    do j=2,j1
-      jp=j+1
-      jm=j-1
-    do i=2,i1
-    
-    if (lpressgrad) then
-      up(i,j,k) = up(i,j,k) - dpdxl(k)      !RN LS pressure gradient force in x,y directions;
-      vp(i,j,k) = vp(i,j,k) - dpdyl(k)
-    end if
-      wp(i,j,k) = wp(i,j,k) + grav*(thv0h(i,j,k)-thvh(k))/thvh(k) - &
-                  grav*(sv0(i,j,k,iqr)*dzf(k-1)+sv0(i,j,k-1,iqr)*dzf(k))/(2.0*dzh(k))
-    end do
-    end do
+       wp(:,:,k) = wp(:,:,k) + grav*(thv0h(:,:,k)-thvh(k))/thvh(k) - &
+                  grav*(sv0(:,:,k,iqr)*dzf(k-1)+sv0(:,:,k-1,iqr)*dzf(k))/(2.0*dzh(k))
     end do
   else
     do k=2,kmax
-      kp=k+1
-      km=k-1
-    do j=2,j1
-      jp=j+1
-      jm=j-1
-    do i=2,i1
-      up(i,j,k) = up(i,j,k) - dpdxl(k)
-      vp(i,j,k) = vp(i,j,k) - dpdyl(k)
-      wp(i,j,k) = wp(i,j,k) + grav*(thv0h(i,j,k)-thvh(k))/thvh(k)
-    end do
-    end do
+      wp(:,:,k) = wp(:,:,k) + grav*(thv0h(:,:,k)-thvh(k))/thvh(k)
     end do
   end if
 
 !     --------------------------------------------
 !     special treatment for lowest full level: k=1
 !     --------------------------------------------
+  wp(:,:,1) = 0.0
 
-  do j=2,j1
-    jp = j+1
-    jm = j-1
-  do i=2,i1
-
-    if (lpressgrad) then
-      up(i,j,1) = up(i,j,1) - dpdxl(1)
-      vp(i,j,1) = vp(i,j,1) - dpdyl(1)
-    end if
-
-    wp(i,j,1) = 0.0
-
-  end do
-  end do
-!     ----------------------------------------------end i,j-loop
-
-
-  return
   end subroutine forces
   subroutine coriolis
 
@@ -152,7 +119,7 @@ subroutine coriolis
   integer i, j, k, jm, jp, km, kp
 
   if (lcoriol .eqv. .false.) return
-  
+
   do k=2,kmax
     kp=k+1
     km=k-1
@@ -232,80 +199,45 @@ subroutine lstend
                         dqtdtls, dthldtls, dudtls, dvdtls
   implicit none
 
-  integer i,j,k,n,kp,km
-  real subs_thl,subs_qt,subs_sv,subs_u,subs_v
+  integer k,kp,km
 
 !     1. DETERMINE LARGE SCALE TENDENCIES
 !        --------------------------------
 
 !     1.1 lowest model level above surface : only downward component
-
-  subs_thl = 0.
-  subs_qt  = 0.
-  subs_sv  = 0.
-  subs_u   = 0.
-  subs_v   = 0.
-
-  do j=2,j1
-    do i=2,i1
-      k = 1
-      if (whls(2).lt.0) then !neglect effect of mean ascending on tendencies at the lowest full level
-        subs_thl     = 0.5*whls(2)  *(thl0(i,j,2)-thl0(i,j,1))/dzh(2)
-        subs_qt      = 0.5*whls(2)  *(qt0(i,j,2)-qt0(i,j,1) )/dzh(2)
-        if (lmomsubs) then
-          subs_u     = 0.5*whls(2)  *(u0(i,j,2)-u0(i,j,1))/dzh(2)
-          subs_v     = 0.5*whls(2)  *(v0(i,j,2)-v0(i,j,1))/dzh(2)
-        endif
-        do n=1,nsv
-          subs_sv =  0.5*whls(2)  *(sv0(i,j,2,n)-sv0(i,j,1,n)  )/dzh(2)
-          svp(i,j,1,n) = svp(i,j,1,n)-subs_sv
-        enddo
-      endif
-      thlp(i,j,1) = thlp(i,j,1) -u0av(1)*dthldxls(1)-v0av(1)*dthldyls(1)-subs_thl + dthldtls(1)
-      qtp(i,j,1)  = qtp (i,j,1) -u0av(1)*dqtdxls (1)-v0av(1)*dqtdyls (1)-subs_qt  + dqtdtls(1)
-      up  (i,j,1) = up  (i,j,1) -u0av(1)*dudxls  (1)-v0av(1)*dudyls  (1)-subs_u   + dudtls(1)
-      vp  (i,j,1) = vp  (i,j,1) -u0av(1)*dvdxls  (1)-v0av(1)*dvdyls  (1)-subs_v   + dvdtls(1)
-    end do
-  end do
-
 !     1.2 other model levels twostream
 
-  do k=2,kmax
+  do k=1,kmax
     kp=k+1
     km=k-1
-    do j=2,j1
-      do i=2,i1
-        if (whls(kp).lt.0) then   !downwind scheme for subsidence
-          subs_thl    = whls(kp) * (thl0(i,j,kp) - thl0(i,j,k))/dzh(kp)
-          subs_qt     = whls(kp) * (qt0 (i,j,kp) - qt0 (i,j,k))/dzh(kp)
-          if (lmomsubs) then
-            subs_u    = whls(kp) * (u0(i,j,kp) - u0(i,j,k))/dzh(kp)
-            subs_v    = whls(kp) * (v0(i,j,kp) - v0(i,j,k))/dzh(kp)
-          endif
-          do n=1,nsv
-            subs_sv   = whls(kp)  *(sv0(i,j,kp,n) - sv0(i,j,k,n))/dzh(kp)
-            svp(i,j,k,n) = svp(i,j,k,n)-subs_sv
-          enddo
-        else !downwind scheme for mean upward motions
-          subs_thl    = whls(k) * (thl0(i,j,k) - thl0(i,j,km))/dzh(k)
-          subs_qt     = whls(k) * (qt0 (i,j,k) - qt0 (i,j,km))/dzh(k)
+
+    if (whls(kp).lt.0) then   !downwind scheme for subsidence
+       thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k) - whls(kp) * (thl0(2:i1,2:j1,kp) - thl0(2:i1,2:j1,k))/dzh(kp)
+       qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k) - whls(kp) * (qt0 (2:i1,2:j1,kp) - qt0 (2:i1,2:j1,k))/dzh(kp)
+       if (lmomsubs) then
+          up(2:i1,2:j1,k) = up(2:i1,2:j1,k) - whls(kp) * (u0(2:i1,2:j1,kp) - u0(2:i1,2:j1,k))/dzh(kp)
+          vp(2:i1,2:j1,k) = vp(2:i1,2:j1,k) - whls(kp) * (v0(2:i1,2:j1,kp) - v0(2:i1,2:j1,k))/dzh(kp)
+       endif
+
+       svp(2:i1,2:j1,k,:) = svp(2:i1,2:j1,k,:) - whls(kp) * (sv0(2:i1,2:j1,kp,:) - sv0(2:i1,2:j1,k,:))/dzh(kp)
+
+    else !downwind scheme for mean upward motions
+       if (k > 1) then !neglect effect of mean ascending on tendencies at the lowest full level
+          thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k) - whls(k) * (thl0(2:i1,2:j1,k) - thl0(2:i1,2:j1,km))/dzh(k)
+          qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k) - whls(k) * (qt0 (2:i1,2:j1,k) - qt0 (2:i1,2:j1,km))/dzh(k)
           if (lmomsubs) then
-            subs_u    = whls(k) * (u0(i,j,k) - u0(i,j,km))/dzh(k)
-            subs_v    = whls(k) * (v0(i,j,k) - v0(i,j,km))/dzh(k)
+             up(2:i1,2:j1,k) = up(2:i1,2:j1,k) - whls(k) * (u0(2:i1,2:j1,k) - u0(2:i1,2:j1,km))/dzh(k)
+             vp(2:i1,2:j1,k) = vp(2:i1,2:j1,k) - whls(k) * (v0(2:i1,2:j1,k) - v0(2:i1,2:j1,km))/dzh(k)
           endif
-          do n=1,nsv
-            subs_sv   = whls(k) * (sv0(i,j,k,n) - sv0(i,j,km,n))/dzh(k)
-            svp(i,j,k,n) = svp(i,j,k,n)-subs_sv
-          enddo
-        endif
-    
-        thlp(i,j,k) = thlp(i,j,k)-u0av(k)*dthldxls(k)-v0av(k)*dthldyls(k)-subs_thl + dthldtls(k)
-        qtp (i,j,k) = qtp (i,j,k)-u0av(k)*dqtdxls (k)-v0av(k)*dqtdyls (k)-subs_qt  + dqtdtls(k)
-        up  (i,j,k) = up  (i,j,k)-u0av(k)*dudxls  (k)-v0av(k)*dudyls  (k)-subs_u   + dudtls(k)
-        vp  (i,j,k) = vp  (i,j,k)-u0av(k)*dvdxls  (k)-v0av(k)*dvdyls  (k)-subs_v   + dvdtls(k)
-
-      enddo
-    enddo
+          svp(2:i1,2:j1,k,:) = svp(2:i1,2:j1,k,:)-whls(k) * (sv0(2:i1,2:j1,k,:) - sv0(2:i1,2:j1,km,:))/dzh(k)
+       endif
+    endif
+
+    thlp(2:i1,2:j1,k) = thlp(2:i1,2:j1,k)-u0av(k)*dthldxls(k)-v0av(k)*dthldyls(k) + dthldtls(k)
+    qtp (2:i1,2:j1,k) = qtp (2:i1,2:j1,k)-u0av(k)*dqtdxls (k)-v0av(k)*dqtdyls (k) + dqtdtls(k)
+    up  (2:i1,2:j1,k) = up  (2:i1,2:j1,k)-u0av(k)*dudxls  (k)-v0av(k)*dudyls  (k) + dudtls(k)
+    vp  (2:i1,2:j1,k) = vp  (2:i1,2:j1,k)-u0av(k)*dvdxls  (k)-v0av(k)*dvdyls  (k) + dvdtls(k)
+
   enddo
 
   return

From 3fe0063101f57f0a66283cd951f72a5aa1ca0270 Mon Sep 17 00:00:00 2001
From: Fredrik Jansson <fjansson@abo.fi>
Date: Tue, 17 Jan 2023 14:29:18 +0100
Subject: [PATCH 02/19] Put sgs_surface_fix back in NAMSUBGRID namelist

The fix can still be required to avoid too high TKE production at the surface.
The default is still false.
---
 src/modsubgrid.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index ab5598061..a407ba5d1 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -107,7 +107,7 @@ subroutine subgridnamelist
     integer :: ierr
 
     namelist/NAMSUBGRID/ &
-        ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,ch1
+        ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1
 
     if(myid==0)then
       open(ifnamopt,file=fname_options,status='old',iostat=ierr)

From 450422e835add2ab7a49f5e613b8a13e1fba8b76 Mon Sep 17 00:00:00 2001
From: Fredrik Jansson <fjansson@abo.fi>
Date: Wed, 7 Jun 2023 15:17:18 +0200
Subject: [PATCH 03/19] modsubgrid: simplify subgrid TKE scheme and add
 anisotropic diffusion option

Move subgrid scheme selection out of for loops for clarity and better vectorization.

To enable anisotropic diffusion, set lanisotrop = .true. in NAMSUBGRID namelist.
Anisotropic diffusion implemented by Stephan de Roode. The anisotropic scheme and
the effects of using it are documented here:
de Roode, S. R., Siebesma, A. P., Jansson, F., & Janssens, M.
Dependency of mesoscale organization on grid anisotropy
in large-eddy simulations of convective boundary layers at Gray Zone resolutions.
JAMES, 14, e2022MS003095 (2022)
https://doi.org/10.1029/2022MS003095
---
 src/modbudget.f90      |  22 +++---
 src/modglobal.f90      |   4 +-
 src/modsubgrid.f90     | 169 ++++++++++++++++++++++++++++-------------
 src/modsubgriddata.f90 |   4 +
 4 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/src/modbudget.f90 b/src/modbudget.f90
index b815ea244..1a0fa194f 100644
--- a/src/modbudget.f90
+++ b/src/modbudget.f90
@@ -213,7 +213,7 @@ subroutine do_genbudget
                           ijtot,cu,cv,grav, &
                           dxi,dyi,dx2i,dy2i
     use modsurfdata,only : ustar
-    use modsubgriddata, only : ekm
+    use modsubgriddata, only : ekm,anis_fac
     use modpois,    only : p
     use modfields,  only : u0,v0,w0,thv0h,u0av,v0av,rhobf,rhobh,thvh
 !cstep    use modtilt,    only : adjustbudget,ltilted
@@ -407,12 +407,12 @@ subroutine do_genbudget
 
       tau1j(i,j,k) = &
                ( ekm(i,j,k)  * (u0(i+1,j,k)-u0(i,j,k)) &
-                -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i &
+                -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) &
                + &
                ( empo * ( (u0(i,jp,k)-u0(i,j,k))   *dyi &
                          +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) &
                 -emmo * ( (u0(i,j,k)-u0(i,jm,k))   *dyi &
-                         +(v0(i,j,k)-v0(i-1,j,k))  *dxi) ) * dyi &
+			+(v0(i,j,k)-v0(i-1,j,k))  *dxi) ) * dyi * anis_fac(k) &
                + &
                ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k))   /dzh(kp) &
                          +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) &
@@ -455,10 +455,10 @@ subroutine do_genbudget
                ( epmo * ( (v0(i+1,j,k)-v0(i,j,k))   *dxi &
                          +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) &
                 -emmo * ( (v0(i,j,k)-v0(i-1,j,k))   *dxi &
-                         +(u0(i,j,k)-u0(i,jm,k))    *dyi)  ) * dxi &
+                         +(u0(i,j,k)-u0(i,jm,k))    *dyi)  ) * dxi * anis_fac(k) &
                + &
                ( ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) &
-                -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k))  ) * 2. * dy2i &
+                -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k))  ) * 2. * dy2i * anis_fac(k) &
                + &
                ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k))    /dzh(kp) &
                          +(w0(i,j,kp)-w0(i,jm,kp))  *dyi) &
@@ -501,12 +501,12 @@ subroutine do_genbudget
                  ( epom * ( (w0(i+1,j,k)-w0(i,j,k))    *dxi &
                            +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) &
                   -emom * ( (w0(i,j,k)-w0(i-1,j,k))    *dxi &
-                           +(u0(i,j,k)-u0(i,j,km))     /dzh(k) ))*dxi &
+                           +(u0(i,j,k)-u0(i,j,km))     /dzh(k) ))*dxi * anis_fac(k) &
                  + &
                  ( eopm * ( (w0(i,jp,k)-w0(i,j,k))     *dyi &
                            +(v0(i,jp,k)-v0(i,jp,km))   /dzh(k) ) &
                   -eomm * ( (w0(i,j,k)-w0(i,jm,k))     *dyi &
-                           +(v0(i,j,k)-v0(i,j,km))     /dzh(k) ))*dyi &
+                           +(v0(i,j,k)-v0(i,j,km))     /dzh(k) ))*dyi * anis_fac(k) &
                  + &
                   ( rhobf(k)/rhobh(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) &
                   - rhobf(km)/rhobh(k) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. &
@@ -553,12 +553,12 @@ subroutine do_genbudget
 
     tau1j(i,j,1) = &
              ( ekm(i,j,1)  * (u0(i+1,j,1)-u0(i,j,1)) &
-              -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i &
+              -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) &
              + &
              ( empo * ( (u0(i,jp,1)-u0(i,j,1))   *dyi &
                        +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) &
              - emmo * ( (u0(i,j,1)-u0(i,jm,1))   *dyi &
-                       +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) * dyi &
+                       +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) * dyi * anis_fac(k) &
              + &
              ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1))    /dzh(2) &
                        +(w0(i,j,2)-w0(i-1,j,2))  *dxi) &
@@ -584,10 +584,10 @@ subroutine do_genbudget
             ( epmo * ( (v0(i+1,j,1)-v0(i,j,1))   *dxi &
                       +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) &
              -emmo * ( (v0(i,j,1)-v0(i-1,j,1))   *dxi &
-                      +(u0(i,j,1)-u0(i,jm,1))    *dyi)   ) * dxi &
+                      +(u0(i,j,1)-u0(i,jm,1))    *dyi)   ) * dxi * anis_fac(k) &
             + &
             ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) &
-             -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1))  ) * 2. * dy2i &
+             -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1))  ) * 2. * dy2i * anis_fac(k) &
             + &
             ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1))     /dzh(2) &
                       +(w0(i,j,2)-w0(i,jm,2))    *dyi) &
diff --git a/src/modglobal.f90 b/src/modglobal.f90
index e550f64cf..8c594f118 100644
--- a/src/modglobal.f90
+++ b/src/modglobal.f90
@@ -216,7 +216,7 @@ module modglobal
       real :: xsize    = -1 !<  domain size in x-direction
       real :: ysize    = -1 !<  domain size in y-direction
       real, allocatable :: delta(:)       !<  (dx*dy*dz)**(1/3)
-      real, allocatable :: deltai(:)       !<  (dx*dy*dz)**(-1/3)
+      real, allocatable :: deltai(:)       !<  (dx*dy*dz)**(-1/3)  or dzf**-1 for anisotropic diffusion
 
       logical :: leq      = .true.  !<  switch for (non)-equidistant mode.
       logical :: lmomsubs = .false.  !<  switch to apply subsidence on the momentum or not
@@ -433,7 +433,7 @@ subroutine initglobal
     do k=1,k1
 
        delta(k) = (dx*dy*dzf(k))**(1./3.)
-       deltai(k) = 1./delta(k)
+       deltai(k) = 1./delta(k)     !can be overruled in modsubgrid in case anisotropic diffusion is applied
     end do
 
   !--------------------------------------------------
diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index a407ba5d1..a2c485718 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -37,14 +37,14 @@ module modsubgrid
 
 contains
   subroutine initsubgrid
-    use modglobal, only : ih,i1,jh,j1,k1,delta,zf,fkar,pi
+    use modglobal, only : ih,i1,jh,j1,k1,delta,deltai,dx,dy,zf,dzf,fkar,pi
     use modmpi, only : myid
 
     implicit none
 
     integer   :: k
 
-    real :: ceps, ch
+    real :: ceps
     real :: mlen
 
     call subgridnamelist
@@ -56,9 +56,12 @@ subroutine initsubgrid
     allocate(sbshr(2-ih:i1+ih,2-jh:j1+jh,1))
     allocate(sbbuo(2-ih:i1+ih,2-jh:j1+jh,1))
     allocate(csz(k1))
+    allocate(anis_fac(k1))
 
     ! Initialize variables to avoid problems when not using subgrid scheme JvdD
+    ! Determination of subgrid constants is explained in De Roode et al. 2017
     ekm=0.; ekh=0.; zlt=0.; sbdiss=0.; sbshr=0.; sbbuo=0.; csz=0.
+    anis_fac = 0.
 
     cm = cf / (2. * pi) * (1.5*alpha_kolm)**(-1.5)
 
@@ -83,6 +86,21 @@ subroutine initsubgrid
       end do
     end if
 
+    if(lanisotrop) then
+       ! Anisotropic diffusion scheme  https://doi.org/10.1029/2022MS003095
+       ! length scale in TKE equation is delta z (private communication with Marat)
+       if ((dx.ne.dy) .and. myid == 0) stop "The anisotropic diffusion assumes dx=dy."
+       do k = 1,k1
+          deltai   (k) = 1./dzf(k)       !overrules deltai (k) = 1/delta(k) as defined in initglobal
+          anis_fac (k) = (dx/dzf(k))**2  !assumes dx=dy. is used to enhance horizontal diffusion
+       end do
+    else
+       do k = 1,k1
+          anis_fac (k) = 1.   !horizontal = vertical diffusion
+       end do
+    endif
+
+
     if (myid==0) then
       write (6,*) 'cf    = ',cf
       write (6,*) 'cm    = ',cm
@@ -107,7 +125,7 @@ subroutine subgridnamelist
     integer :: ierr
 
     namelist/NAMSUBGRID/ &
-        ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1
+        ldelta,lmason,cf,cn,Rigc,Prandtl,lsmagorinsky,cs,nmason,sgs_surface_fix,ch1,lanisotrop
 
     if(myid==0)then
       open(ifnamopt,file=fname_options,status='old',iostat=ierr)
@@ -115,12 +133,16 @@ subroutine subgridnamelist
       call checknamelisterror(ierr, ifnamopt, 'NAMSUBGRID')
       write(6 ,NAMSUBGRID)
       close(ifnamopt)
+
+      if (lmason .and. .not. ldelta) stop "lmason = .true. requires ldelta = .true."
+      if (lmason .and. lanisotrop) stop "lmason = .true. is not compatible with lanisotropic = .true."
     end if
 
     call D_MPI_BCAST(ldelta          ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(lmason          ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(nmason          ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(lsmagorinsky    ,1, 0,comm3d,mpierr)
+    call D_MPI_BCAST(lanisotrop      ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(cs              ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(cf              ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(cn              ,1, 0,comm3d,mpierr)
@@ -128,7 +150,6 @@ subroutine subgridnamelist
     call D_MPI_BCAST(Prandtl         ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(sgs_surface_fix ,1, 0,comm3d,mpierr)
     call D_MPI_BCAST(ch1             ,1, 0,comm3d,mpierr)
-
   end subroutine subgridnamelist
 
   subroutine subgrid
@@ -288,37 +309,79 @@ subroutine closure
     end do
 
   ! do TKE scheme
-  else
-    do k=1,kmax
-      do j=2,j1
-        do i=2,i1
-          if (ldelta .or. (dthvdz(i,j,k)<=0)) then
-            zlt(i,j,k) = delta(k)
-            if (lmason) zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason)
-            ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k)
-            ekh(i,j,k) = (ch1 + ch2) * ekm(i,j,k)
-
-            ekm(i,j,k) = max(ekm(i,j,k),ekmin)
-            ekh(i,j,k) = max(ekh(i,j,k),ekmin)
-          else
-             ! zlt(i,j,k) = min(delta(k),cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k))))
-             ! faster calculation: evaluate sqrt only if the second argument is actually smaller
-             zlt(i,j,k) = delta(k)
-             if ( grav*abs(dthvdz(i,j,k)) * delta(k)**2 > (cn*e120(i,j,k))**2 * thvf(k) ) then
-                zlt(i,j,k) = cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k)))
-             end if
-             
-            if (lmason) zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason)
-
-            ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k)
-            ekh(i,j,k) = (ch1 + ch2 * zlt(i,j,k)*deltai(k)) * ekm(i,j,k)
-
-            ekm(i,j,k) = max(ekm(i,j,k),ekmin)
-            ekh(i,j,k) = max(ekh(i,j,k),ekmin)
-          endif
-        end do
-      end do
-    end do
+ else
+    ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment
+    if (ldelta .and. .not. lmason) then
+       do k=1,kmax
+          do j=2,j1
+             do i=2,i1
+                zlt(i,j,k) = delta(k)
+                
+                ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k)
+                ekh(i,j,k) = ch * ekm(i,j,k)
+
+                ekm(i,j,k) = max(ekm(i,j,k),ekmin)
+                ekh(i,j,k) = max(ekh(i,j,k),ekmin)
+             end do
+          end do
+       end do
+    else if (ldelta .and. lmason) then ! delta scheme with Mason length scale correction
+       do k=1,kmax
+          do j=2,j1
+             do i=2,i1
+                zlt(i,j,k) = delta(k)
+                zlt(i,j,k) = (1. / zlt(i,j,k) ** nmason + 1. / ( fkar * (zf(k) + z0m(i,j)))**nmason) ** (-1./nmason)                
+                
+                ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k)
+                ekh(i,j,k) = ch * ekm(i,j,k)
+                            
+                ekm(i,j,k) = max(ekm(i,j,k),ekmin)
+                ekh(i,j,k) = max(ekh(i,j,k),ekmin)
+             end do
+          end do
+       end do
+    else if (lanisotrop) then ! Anisotropic diffusion,  https://doi.org/10.1029/2022MS003095
+       do k=1,kmax
+          do j=2,j1
+             do i=2,i1
+                zlt(i,j,k) = dzf(k)
+                
+                ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k)
+                ekh(i,j,k) = ch * ekm(i,j,k)
+                
+                ekm(i,j,k) = max(ekm(i,j,k),ekmin)
+                ekh(i,j,k) = max(ekh(i,j,k),ekmin)
+             end do
+          end do
+       end do
+    else ! Deardorff lengthscale correction
+       do k=1,kmax
+          do j=2,j1
+             do i=2,i1
+                zlt(i,j,k) = delta(k)
+
+                !original
+                !if (dthvdz(i,j,k) > 0) then
+                !zlt(i,j,k) = min(delta(k),cn*e120(i,j,k)/sqrt(grav/thvf(k)*abs(dthvdz(i,j,k))))
+                !end if
+                
+                ! alternative without if
+                zlt(i,j,k) = min(delta(k), &                                                     
+                     cn*e120(i,j,k) / sqrt( grav/thvf(k) * abs(dthvdz(i,j,k))) + &               
+                     delta(k) * (1.0-sign(1.0_field_r,dthvdz(i,j,k))))                           
+                ! the final line is 0 if dthvdz(i,j,k) > 0, else 2*delta(k)                        
+                ! ensuring that zlt(i,j,k) = delta(k) when dthvdz < 0, as                        
+                ! in the original scheme.            
+                
+                ekm(i,j,k) = cm * zlt(i,j,k) * e120(i,j,k)
+                ekh(i,j,k) = (ch1 + ch2 * zlt(i,j,k)*deltai(k)) * ekm(i,j,k)
+                
+                ekm(i,j,k) = max(ekm(i,j,k),ekmin)
+                ekh(i,j,k) = max(ekh(i,j,k),ekmin)
+             end do
+          end do
+       end do
+    end if
   end if
 
 !*************************************************************
@@ -530,10 +593,10 @@ subroutine diffc (a_in,a_out,flux)
           a_out(i,j,k) = a_out(i,j,k) &
                     +  0.5 * ( &
                   ( (ekh(i+1,j,k)+ekh(i,j,k))*(a_in(i+1,j,k)-a_in(i,j,k)) &
-                    -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i &
+                    -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i * anis_fac(k) &
                     + &
                   ( (ekh(i,jp,k)+ekh(i,j,k)) *(a_in(i,jp,k)-a_in(i,j,k)) &
-                    -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i &
+                    -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i * anis_fac(k) &
                   + &
                   ( rhobh(kp)/rhobf(k) * (dzf(kp)*ekh(i,j,k) + dzf(k)*ekh(i,j,kp)) &
                     *  (a_in(i,j,kp)-a_in(i,j,k)) / dzh(kp)**2 &
@@ -552,10 +615,10 @@ subroutine diffc (a_in,a_out,flux)
         a_out(i,j,1) = a_out(i,j,1) &
                   + 0.5 * ( &
                 ( (ekh(i+1,j,1)+ekh(i,j,1))*(a_in(i+1,j,1)-a_in(i,j,1)) &
-                  -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i &
+                  -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(k) &
                   + &
                 ( (ekh(i,j+1,1)+ekh(i,j,1))*(a_in(i,j+1,1)-a_in(i,j,1)) &
-                  -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i &
+                  -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(k) &
                   + &
                 ( rhobh(2)/rhobf(1) * (dzf(2)*ekh(i,j,1) + dzf(1)*ekh(i,j,2)) &
                   *  (a_in(i,j,2)-a_in(i,j,1)) / dzh(2)**2 &
@@ -591,10 +654,10 @@ subroutine diffe(a_out)
           a_out(i,j,k) = a_out(i,j,k) &
                   +  ( &
               ((ekm(i+1,j,k)+ekm(i,j,k))*(e120(i+1,j,k)-e120(i,j,k)) &
-              -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i &
+              -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i * anis_fac(k) &
                   + &
               ((ekm(i,jp,k)+ekm(i,j,k)) *(e120(i,jp,k)-e120(i,j,k)) &
-              -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i &
+              -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i * anis_fac(k) &
                   + &
               (rhobh(kp)/rhobf(k) * (dzf(kp)*ekm(i,j,k) + dzf(k)*ekm(i,j,kp)) &
               *(e120(i,j,kp)-e120(i,j,k)) / dzh(kp)**2 &
@@ -615,10 +678,10 @@ subroutine diffe(a_out)
 
         a_out(i,j,1) = a_out(i,j,1) + &
             ( (ekm(i+1,j,1)+ekm(i,j,1))*(e120(i+1,j,1)-e120(i,j,1)) &
-              -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i &
+              -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(k) &
             + &
             ( (ekm(i,j+1,1)+ekm(i,j,1))*(e120(i,j+1,1)-e120(i,j,1)) &
-              -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i &
+              -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(k) &
             + &
               ( rhobh(2)/rhobf(1) * (dzf(2)*ekm(i,j,1) + dzf(1)*ekm(i,j,2)) &
               *  (e120(i,j,2)-e120(i,j,1)) / dzh(2)**2              )/dzf(1)
@@ -670,12 +733,12 @@ subroutine diffu (a_out)
           a_out(i,j,k) = a_out(i,j,k) &
                   + &
                   ( ekm(i,j,k)  * (u0(i+1,j,k)-u0(i,j,k)) &
-                    -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i &
+                    -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) &
                   + &
                   ( empo * ( (u0(i,jp,k)-u0(i,j,k))   *dyi &
                             +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) &
                     -emmo * ( (u0(i,j,k)-u0(i,jm,k))   *dyi &
-                            +(v0(i,j,k)-v0(i-1,j,k))  *dxi)   ) / dy &
+                            +(v0(i,j,k)-v0(i-1,j,k))  *dxi)   ) / dy * anis_fac(k) &
                   + &
                   ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k))   /dzh(kp) &
                             +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) &
@@ -723,12 +786,12 @@ subroutine diffu (a_out)
         a_out(i,j,1) = a_out(i,j,1) &
                 + &
               ( ekm(i,j,1)  * (u0(i+1,j,1)-u0(i,j,1)) &
-              -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i &
+              -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) &
                 + &
               ( empo * ( (u0(i,jp,1)-u0(i,j,1))   *dyi &
                         +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) &
               -emmo * ( (u0(i,j,1)-u0(i,jm,1))   *dyi &
-                        +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) / dy &
+                        +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) / dy * anis_fac(k) &
                + &
               ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1))    /dzh(2) &
                         +(w0(i,j,2)-w0(i-1,j,2))  *dxi) &
@@ -783,10 +846,10 @@ subroutine diffv (a_out)
               ( epmo * ( (v0(i+1,j,k)-v0(i,j,k))   *dxi &
                         +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) &
                 -emmo * ( (v0(i,j,k)-v0(i-1,j,k))   *dxi &
-                        +(u0(i,j,k)-u0(i,jm,k))    *dyi)   ) / dx &
+                        +(u0(i,j,k)-u0(i,jm,k))    *dyi)   ) / dx * anis_fac(k) &
                 + &
               (ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) &
-              -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k))  ) * 2. * dy2i &
+              -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k))  ) * 2. * dy2i * anis_fac(k) &
                 + &
               ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k))    /dzh(kp) &
                         +(w0(i,j,kp)-w0(i,jm,kp))  *dyi) &
@@ -833,10 +896,10 @@ subroutine diffv (a_out)
                   ( epmo * ( (v0(i+1,j,1)-v0(i,j,1))   *dxi &
                             +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) &
                     -emmo * ( (v0(i,j,1)-v0(i-1,j,1))   *dxi &
-                            +(u0(i,j,1)-u0(i,jm,1))    *dyi)   ) / dx &
+                            +(u0(i,j,1)-u0(i,jm,1))    *dyi)   ) / dx * anis_fac(k) &
                   + &
                 ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) &
-                  -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1))  ) * 2. * dy2i &
+                  -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1))  ) * 2. * dy2i * anis_fac(k) &
                   + &
                 ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1))     /dzh(2) &
                           +(w0(i,j,2)-w0(i,jm,2))    *dyi) &
@@ -891,12 +954,12 @@ subroutine diffw(a_out)
                   ( epom * ( (w0(i+1,j,k)-w0(i,j,k))    *dxi &
                             +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) &
                     -emom * ( (w0(i,j,k)-w0(i-1,j,k))    *dxi &
-                            +(u0(i,j,k)-u0(i,j,km))     /dzh(k) ))/dx &
+                            +(u0(i,j,k)-u0(i,j,km))     /dzh(k) ))/dx * anis_fac(k) &
                 + &
                   ( eopm * ( (w0(i,jp,k)-w0(i,j,k))     *dyi &
                             +(v0(i,jp,k)-v0(i,jp,km))   /dzh(k) ) &
                     -eomm * ( (w0(i,j,k)-w0(i,jm,k))     *dyi &
-                            +(v0(i,j,k)-v0(i,j,km))     /dzh(k) ))/dy &
+                            +(v0(i,j,k)-v0(i,j,km))     /dzh(k) ))/dy * anis_fac(k) &
                 + (1./rhobh(k))*&
                   ( rhobf(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) &
                   - rhobf(km) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. &
diff --git a/src/modsubgriddata.f90 b/src/modsubgriddata.f90
index 81569c36b..8f8ad4fb2 100644
--- a/src/modsubgriddata.f90
+++ b/src/modsubgriddata.f90
@@ -35,6 +35,8 @@ module modsubgriddata
   logical :: ldelta       = .false. !<  switch for subgrid length formulation (on/off)
   logical :: lmason       = .false. !<  switch for decreased length scale near the surface
   logical :: lsmagorinsky = .false. !<  switch for smagorinsky subgrid scheme
+  logical :: lanisotrop   = .false. !<  switch for anisotropic diffusion
+
   real :: cf      = 2.5  !< filter constant
   real :: Rigc    = 0.25 !< critical Richardson number
   real :: Prandtl = (1.0/3.0)
@@ -42,6 +44,7 @@ module modsubgriddata
   real :: cn      = 0.76
   real :: ch1     = 1.
   real :: ch2     = 2.
+  real :: ch                   ! ch = ch1 + ch2
   real :: ce1     = 0.19
   real :: ce2     = 0.51
   real :: cs      = -1
@@ -60,5 +63,6 @@ module modsubgriddata
 
   real, allocatable :: csz(:)       !< Smagorinsky constant
 
+  real, allocatable :: anis_fac(:)  !< grid anisotropy factor 
 end module modsubgriddata
 

From f23bb5a8de0300f8c77c2b876f9cc38b9491129b Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Fri, 9 Jun 2023 10:26:08 +0200
Subject: [PATCH 04/19] Clean up modsubgrid

- Separated the calculations for the first level in the Smagorinsky model. This improves data parallelism.
- Removed statements like km=k-1
---
 src/modsubgrid.f90 | 142 +++++++++++++++++++++++----------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index ab5598061..1b55b3b5c 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -205,78 +205,80 @@ subroutine closure
   integer :: i,j,k,kp,km,jp,jm
 
   if(lsmagorinsky) then
-    do k = 1,kmax
-      mlen        = csz(k) * delta(k)
-
+    ! First level
+    mlen = csz(1) * delta(1)
+    do i = 2,i1
+      do j = 2,j1
+        strain2 =  ( &
+          ((u0(i+1,j,1)-u0(i,j,1))   *dxi        )**2    + &
+          ((v0(i,j+1,1)-v0(i,j,1))   *dyi        )**2    + &
+          ((w0(i,j,2)-w0(i,j,1))     /dzf(1)     )**2    )
+
+        strain2 = strain2 + 0.5 * ( &
+          ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
+          dudz(i,j)   )**2 )
+
+        strain2 = strain2 + 0.125 * ( &
+          ((u0(i,j+1,1)-u0(i,j,1))     *dyi     + &
+           (v0(i,j+1,1)-v0(i-1,j+1,1)) *dxi        )**2    + &
+          ((u0(i,j,1)-u0(i,j-1,1))     *dyi     + &
+           (v0(i,j,1)-v0(i-1,j,1))     *dxi        )**2    + &
+          ((u0(i+1,j,1)-u0(i+1,j-1,1)) *dyi     + &
+           (v0(i+1,j,1)-v0(i,j,1))     *dxi        )**2    + &
+          ((u0(i+1,j+1,1)-u0(i+1,j,1)) *dyi     + &
+           (v0(i+1,j+1,1)-v0(i,j+1,1)) *dxi        )**2    )
+
+        strain2 = strain2 + 0.5 * ( &
+          (0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + &
+           dvdz(i,j)   )**2 )
+
+        ekm(i,j,1)  = mlen ** 2. * sqrt(2. * strain2)
+        ekh(i,j,1)  = ekm(i,j,1) / Prandtl
+
+        ekm(i,j,1) = max(ekm(i,j,1),ekmin)
+        ekh(i,j,1) = max(ekh(i,j,1),ekmin)
+      end do
+    end do
+    
+    ! Other levels
+    do k = 2,kmax
+      mlen = csz(k) * delta(k)
       do i = 2,i1
         do j = 2,j1
-
-          kp=k+1
-          km=k-1
-          jp=j+1
-          jm=j-1
-
-          if(k == 1) then
-            strain2 =  ( &
-              ((u0(i+1,j,k)-u0(i,j,k))   *dxi        )**2    + &
-              ((v0(i,jp,k)-v0(i,j,k))    *dyi        )**2    + &
-              ((w0(i,j,kp)-w0(i,j,k))    /dzf(k)     )**2    )
-
-            strain2 = strain2 + 0.5 * ( &
-              ( 0.25*(w0(i+1,j,kp)-w0(i-1,j,kp))*dxi + &
-              dudz(i,j)   )**2 )
-
-            strain2 = strain2 + 0.125 * ( &
-              ((u0(i,jp,k)-u0(i,j,k))     *dyi     + &
-              (v0(i,jp,k)-v0(i-1,jp,k))  *dxi        )**2    + &
-              ((u0(i,j,k)-u0(i,jm,k))     *dyi     + &
-              (v0(i,j,k)-v0(i-1,j,k))    *dxi        )**2    + &
-              ((u0(i+1,j,k)-u0(i+1,jm,k)) *dyi     + &
-              (v0(i+1,j,k)-v0(i,j,k))    *dxi        )**2    + &
-              ((u0(i+1,jp,k)-u0(i+1,j,k)) *dyi     + &
-              (v0(i+1,jp,k)-v0(i,jp,k))  *dxi        )**2    )
-
-            strain2 = strain2 + 0.5 * ( &
-              ( 0.25*(w0(i,jp,kp)-w0(i,jm,kp))*dyi + &
-              dvdz(i,j)   )**2 )
-
-          else
-
-            strain2 =  ( &
-              ((u0(i+1,j,k)-u0(i,j,k))   *dxi        )**2    + &
-              ((v0(i,jp,k)-v0(i,j,k))    *dyi        )**2    + &
-              ((w0(i,j,kp)-w0(i,j,k))    /dzf(k)     )**2    )
-
-            strain2 = strain2 + 0.125 * ( &
-              ((w0(i,j,kp)-w0(i-1,j,kp))  *dxi     + &
-              (u0(i,j,kp)-u0(i,j,k))     / dzh(kp)  )**2    + &
-              ((w0(i,j,k)-w0(i-1,j,k))    *dxi     + &
-              (u0(i,j,k)-u0(i,j,km))     / dzh(k)   )**2    + &
-              ((w0(i+1,j,k)-w0(i,j,k))    *dxi     + &
-              (u0(i+1,j,k)-u0(i+1,j,km)) / dzh(k)   )**2    + &
-              ((w0(i+1,j,kp)-w0(i,j,kp))  *dxi     + &
-              (u0(i+1,j,kp)-u0(i+1,j,k)) / dzh(kp)  )**2    )
-
-            strain2 = strain2 + 0.125 * ( &
-              ((u0(i,jp,k)-u0(i,j,k))     *dyi     + &
-              (v0(i,jp,k)-v0(i-1,jp,k))  *dxi        )**2    + &
-              ((u0(i,j,k)-u0(i,jm,k))     *dyi     + &
-              (v0(i,j,k)-v0(i-1,j,k))    *dxi        )**2    + &
-              ((u0(i+1,j,k)-u0(i+1,jm,k)) *dyi     + &
-              (v0(i+1,j,k)-v0(i,j,k))    *dxi        )**2    + &
-              ((u0(i+1,jp,k)-u0(i+1,j,k)) *dyi     + &
-              (v0(i+1,jp,k)-v0(i,jp,k))  *dxi        )**2    )
-
-            strain2 = strain2 + 0.125 * ( &
-              ((v0(i,j,kp)-v0(i,j,k))     / dzh(kp) + &
-              (w0(i,j,kp)-w0(i,jm,kp))   *dyi        )**2    + &
-              ((v0(i,j,k)-v0(i,j,km))     / dzh(k)+ &
-              (w0(i,j,k)-w0(i,jm,k))     *dyi        )**2    + &
-              ((v0(i,jp,k)-v0(i,jp,km))   / dzh(k)+ &
-              (w0(i,jp,k)-w0(i,j,k))     *dyi        )**2    + &
-              ((v0(i,jp,kp)-v0(i,jp,k))   / dzh(kp) + &
-              (w0(i,jp,kp)-w0(i,j,kp))   *dyi        )**2    )
-          end if
+          strain2 =  ( &
+            ((u0(i+1,j,k)-u0(i,j,k))    *dxi        )**2    + &
+            ((v0(i,j+1,k)-v0(i,j,k))    *dyi        )**2    + &
+            ((w0(i,j,k+1)-w0(i,j,k))    /dzf(k)     )**2    )
+
+          strain2 = strain2 + 0.125 * ( &
+            ((w0(i,j,k+1)-w0(i-1,j,k+1)) *dxi     + &
+             (u0(i,j,k+1)-u0(i,j,k))     /dzh(k+1)  )**2    + &
+            ((w0(i,j,k)-w0(i-1,j,k))     *dxi     + &
+             (u0(i,j,k)-u0(i,j,k-1))     /dzh(k)    )**2    + &
+            ((w0(i+1,j,k)-w0(i,j,k))     *dxi     + &
+             (u0(i+1,j,k)-u0(i+1,j,k-1)) /dzh(k)    )**2    + &
+            ((w0(i+1,j,k+1)-w0(i,j,k+1)) *dxi     + &
+             (u0(i+1,j,k+1)-u0(i+1,j,k)) /dzh(k+1)  )**2    )
+
+          strain2 = strain2 + 0.125 * ( &
+            ((u0(i,j+1,k)-u0(i,j,k))     *dyi     + &
+             (v0(i,j+1,k)-v0(i-1,j+1,k)) *dxi        )**2    + &
+            ((u0(i,j,k)-u0(i,j-1,k))     *dyi     + &
+             (v0(i,j,k)-v0(i-1,j,k))     *dxi        )**2    + &
+            ((u0(i+1,j,k)-u0(i+1,j-1,k)) *dyi     + &
+             (v0(i+1,j,k)-v0(i,j,k))     *dxi        )**2    + &
+            ((u0(i+1,j+1,k)-u0(i+1,j,k)) *dyi     + &
+             (v0(i+1,j+1,k)-v0(i,j+1,k)) *dxi        )**2    )
+
+          strain2 = strain2 + 0.125 * ( &
+            ((v0(i,j,k+1)-v0(i,j,k))     /dzh(k+1) + &
+            (w0(i,j,k+1)-w0(i,j-1,k+1))  *dyi        )**2    + &
+            ((v0(i,j,k)-v0(i,j,k-1))     /dzh(k)   + &
+            (w0(i,j,k)-w0(i,j-1,k))      *dyi        )**2    + &
+            ((v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k)   + &
+            (w0(i,j+1,k)-w0(i,j,k))      *dyi        )**2    + &
+            ((v0(i,j+1,k+1)-v0(i,j+1,k)) /dzh(k+1) + &
+            (w0(i,j+1,k+1)-w0(i,j,k+1))  *dyi        )**2    )
 
           ekm(i,j,k)  = mlen ** 2. * sqrt(2. * strain2)
           ekh(i,j,k)  = ekm(i,j,k) / Prandtl

From b246e558008e10b24fd68bb7a0d3f25a0d485d51 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Fri, 9 Jun 2023 12:13:00 +0200
Subject: [PATCH 05/19] Add spaces [skip ci]

---
 src/modsubgrid.f90 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 1b55b3b5c..d2b034a89 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -272,13 +272,13 @@ subroutine closure
 
           strain2 = strain2 + 0.125 * ( &
             ((v0(i,j,k+1)-v0(i,j,k))     /dzh(k+1) + &
-            (w0(i,j,k+1)-w0(i,j-1,k+1))  *dyi        )**2    + &
+             (w0(i,j,k+1)-w0(i,j-1,k+1)) *dyi        )**2    + &
             ((v0(i,j,k)-v0(i,j,k-1))     /dzh(k)   + &
-            (w0(i,j,k)-w0(i,j-1,k))      *dyi        )**2    + &
+             (w0(i,j,k)-w0(i,j-1,k))     *dyi        )**2    + &
             ((v0(i,j+1,k)-v0(i,j+1,k-1)) /dzh(k)   + &
-            (w0(i,j+1,k)-w0(i,j,k))      *dyi        )**2    + &
+             (w0(i,j+1,k)-w0(i,j,k))     *dyi        )**2    + &
             ((v0(i,j+1,k+1)-v0(i,j+1,k)) /dzh(k+1) + &
-            (w0(i,j+1,k+1)-w0(i,j,k+1))  *dyi        )**2    )
+             (w0(i,j+1,k+1)-w0(i,j,k+1)) *dyi        )**2    )
 
           ekm(i,j,k)  = mlen ** 2. * sqrt(2. * strain2)
           ekh(i,j,k)  = ekm(i,j,k) / Prandtl

From 457d34aa59d3a1e0e43a9a172692657064ee1428 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Sat, 10 Jun 2023 18:29:51 +0200
Subject: [PATCH 06/19] Remove unused variables

---
 src/modsubgrid.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 1f25b2423..aed86c371 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -223,7 +223,7 @@ subroutine closure
   implicit none
 
   real    :: strain2,mlen
-  integer :: i,j,k,kp,km,jp,jm
+  integer :: i,j,k
 
   if(lsmagorinsky) then
     ! First level

From f46fa37d441b16fe6e478fd4def7f78125c45880 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Sun, 11 Jun 2023 20:28:18 +0200
Subject: [PATCH 07/19] Temporary fix for halo transfers

---
 src/modmpi.f90 | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/modmpi.f90 b/src/modmpi.f90
index 5564689d8..079281cb5 100644
--- a/src/modmpi.f90
+++ b/src/modmpi.f90
@@ -35,6 +35,9 @@
 
 module modmpi
 use modmpiinterface
+#ifdef _OPENACC
+use openacc
+#endif
 implicit none
 save
   type(MPI_COMM) :: commwrld, comm3d, commrow, commcol
@@ -344,8 +347,14 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sends,recvs &
                                           , sende,recve &
                                           , sendw,recvw
+  ! Check if the data is on the gpu
+  #ifdef _OPENACC
+  integer :: is_present
+  is_present = acc_is_present(a)
+  #endif
 
 ! Calulate buffer lengths
+  !$acc kernels if(is_present) async
   xl = size(a,1)
   yl = size(a,2)
   zl = size(a,3)
@@ -382,8 +391,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
+    !$acc kernels default(present)
     a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:)
     a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:)
+    !$acc end kernels
 
   endif
 
@@ -414,9 +425,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
+    !$acc kernels default(present)
     a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:)
     a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:)
-
+    !$acc end kernels
   endif
 
   if(nprocy.gt.1)then
@@ -456,11 +468,18 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sends,recvs &
                                           , sende,recve &
                                           , sendw,recvw
+  ! Check if the data is on the gpu
+  #ifdef _OPENACC
+  integer :: is_present
+  is_present = acc_is_present(a)
+  #endif
 
 ! Calulate buffer lengths
+  !$acc kernels default(present) if(is_present) async(1)
   xl = size(a,1)
   yl = size(a,2)
   zl = size(a,3)
+  !$acc end kernels
 
 !   Calculate buffer size
   nssize = xl*jh*zl
@@ -495,8 +514,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
+    !$acc kernels default(present) if(is_present) async(1)
     a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:)
     a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:)
+    !$acc end kernels
 
   endif
 
@@ -527,8 +548,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
+    !$acc kernels default(present) if(is_present) async(1)
     a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:)
     a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:)
+    !$acc end kernels
 
   endif
 

From edd82bc585be3b30c8e3b4b709f0a671d0619db4 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Mon, 12 Jun 2023 17:03:55 +0200
Subject: [PATCH 08/19] Add temporary timers

---
 src/modsubgrid.f90 | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index aed86c371..9e58d9c24 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -31,6 +31,7 @@
 module modsubgrid
 use modsubgriddata
 use modprecision, only: field_r
+use modtimer, only: timer_tic, timer_toc
 implicit none
 save
   public :: subgrid, initsubgrid, exitsubgrid, subgridnamelist
@@ -162,8 +163,10 @@ subroutine subgrid
     use modsurfdata,only : thlflux,qtflux,svflux
     implicit none
     integer n
-
+    
+    call timer_tic("closure", 0)
     call closure
+    call timer_toc("closure")
     call diffu(up)
     call diffv(vp)
     call diffw(wp)
@@ -226,6 +229,7 @@ subroutine closure
   integer :: i,j,k
 
   if(lsmagorinsky) then
+    call timer_tic("Smagorinsky", 1)
     ! First level
     mlen = csz(1) * delta(1)
     do i = 2,i1
@@ -309,7 +313,7 @@ subroutine closure
         end do
       end do
     end do
-
+    call timer_toc("Smagorinsky")
   ! do TKE scheme
  else
     ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment
@@ -385,20 +389,22 @@ subroutine closure
        end do
     end if
   end if
-
 !*************************************************************
 !     Set cyclic boundary condition for K-closure factors.
 !*************************************************************
-
+  call timer_tic("Boundary conditions", 2)
   call excjs( ekm           , 2,i1,2,j1,1,k1,ih,jh)
   call excjs( ekh           , 2,i1,2,j1,1,k1,ih,jh)
+  call timer_toc("Boundary conditions") 
 
+  call timer_tic("Write buffer", 3)
   do j=1,j2
     do i=1,i2
       ekm(i,j,k1)  = ekm(i,j,kmax)
       ekh(i,j,k1)  = ekh(i,j,kmax)
     end do
   end do
+  call timer_toc("Write buffer")
 
   return
   end subroutine closure

From c9ac6194241a27bec24501782f7530030ad3fbad Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Mon, 12 Jun 2023 20:46:30 +0200
Subject: [PATCH 09/19] Close kernel region

---
 src/modmpi.f90 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/modmpi.f90 b/src/modmpi.f90
index 079281cb5..eca1166d5 100644
--- a/src/modmpi.f90
+++ b/src/modmpi.f90
@@ -358,6 +358,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   xl = size(a,1)
   yl = size(a,2)
   zl = size(a,3)
+  !$acc end kernels 
 
 !   Calculate buffer size
   nssize = xl*jh*zl

From 822cb9991c6bb2dfe9e9a0992fdbed1217dc7703 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 09:25:15 +0200
Subject: [PATCH 10/19] Clean up subgrid

---
 src/modsubgrid.f90 | 491 +++++++++++++++++++++++++--------------------
 1 file changed, 270 insertions(+), 221 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 9e58d9c24..334541b5e 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -442,56 +442,49 @@ subroutine sources
   real    tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv
   integer i,j,k,jm,jp,km,kp
 
-
   do k=2,kmax
-  do j=2,j1
-  do i=2,i1
-    kp=k+1
-    km=k-1
-    jp=j+1
-    jm=j-1
-
-    tdef2 = 2. * ( &
-             ((u0(i+1,j,k)-u0(i,j,k))   /dx         )**2    + &
-             ((v0(i,jp,k)-v0(i,j,k))    /dy         )**2    + &
-             ((w0(i,j,kp)-w0(i,j,k))    /dzf(k)     )**2    )
-
-    tdef2 = tdef2 + 0.25 * ( &
-              ((w0(i,j,kp)-w0(i-1,j,kp))  / dx     + &
-               (u0(i,j,kp)-u0(i,j,k))     / dzh(kp)  )**2    + &
-              ((w0(i,j,k)-w0(i-1,j,k))    / dx     + &
-               (u0(i,j,k)-u0(i,j,km))     / dzh(k)   )**2    + &
-              ((w0(i+1,j,k)-w0(i,j,k))    / dx     + &
-               (u0(i+1,j,k)-u0(i+1,j,km)) / dzh(k)   )**2    + &
-              ((w0(i+1,j,kp)-w0(i,j,kp))  / dx     + &
-               (u0(i+1,j,kp)-u0(i+1,j,k)) / dzh(kp)  )**2    )
-
-    tdef2 = tdef2 + 0.25 * ( &
-              ((u0(i,jp,k)-u0(i,j,k))     / dy     + &
-               (v0(i,jp,k)-v0(i-1,jp,k))  / dx        )**2    + &
-              ((u0(i,j,k)-u0(i,jm,k))     / dy     + &
-               (v0(i,j,k)-v0(i-1,j,k))    / dx        )**2    + &
-              ((u0(i+1,j,k)-u0(i+1,jm,k)) / dy     + &
-               (v0(i+1,j,k)-v0(i,j,k))    / dx        )**2    + &
-              ((u0(i+1,jp,k)-u0(i+1,j,k)) / dy     + &
-               (v0(i+1,jp,k)-v0(i,jp,k))  / dx        )**2    )
-
-    tdef2 = tdef2 + 0.25 * ( &
-              ((v0(i,j,kp)-v0(i,j,k))     / dzh(kp) + &
-               (w0(i,j,kp)-w0(i,jm,kp))   / dy        )**2    + &
-              ((v0(i,j,k)-v0(i,j,km))     / dzh(k)+ &
-               (w0(i,j,k)-w0(i,jm,k))     / dy        )**2    + &
-              ((v0(i,jp,k)-v0(i,jp,km))   / dzh(k)+ &
-               (w0(i,jp,k)-w0(i,j,k))     / dy        )**2    + &
-              ((v0(i,jp,kp)-v0(i,jp,k))   / dzh(kp) + &
-               (w0(i,jp,kp)-w0(i,j,kp))   / dy        )**2    )
-
-    e12p(i,j,k) = e12p(i,j,k) &
-                + (ekm(i,j,k)*tdef2 - ekh(i,j,k)*grav/thvf(k)*dthvdz(i,j,k) ) / (2*e120(i,j,k)) &  !  sbshr and sbbuo
-                - (ce1 + ce2*zlt(i,j,k)*deltai(k)) * e120(i,j,k)**2 /(2.*zlt(i,j,k))               !  sbdiss
-    
-  end do
-  end do
+    do j=2,j1
+      do i=2,i1
+        tdef2 = 2. * ( &
+                 ((u0(i+1,j,k)-u0(i,j,k))    /dx         )**2    + &
+                 ((v0(i,j+1,k)-v0(i,j,k))    /dy         )**2    + &
+                 ((w0(i,j,k+1)-w0(i,j,k))    /dzf(k)     )**2    )
+
+        tdef2 = tdef2 + 0.25 * ( &
+                  ((w0(i,j,k+1)-w0(i-1,j,k+1))  / dx       + &
+                   (u0(i,j,k+1)-u0(i,j,k))      / dzh(k+1) )**2    + &
+                  ((w0(i,j,k)-w0(i-1,j,k))      / dx       + &
+                   (u0(i,j,k)-u0(i,j,k-1))      / dzh(k)   )**2    + &
+                  ((w0(i+1,j,k)-w0(i,j,k))      / dx       + &
+                   (u0(i+1,j,k)-u0(i+1,j,k-1))  / dzh(k)   )**2    + &
+                  ((w0(i+1,j,k+1)-w0(i,j,k+1))  / dx       + &
+                   (u0(i+1,j,k+1)-u0(i+1,j,k))  / dzh(k+1) )**2    )
+
+        tdef2 = tdef2 + 0.25 * ( &
+                  ((u0(i,j+1,k)-u0(i,j,k))      / dy + &
+                   (v0(i,j+1,k)-v0(i-1,j+1,k))  / dx )**2    + &
+                  ((u0(i,j,k)-u0(i,j-1,k))      / dy + &
+                   (v0(i,j,k)-v0(i-1,j,k))      / dx )**2    + &
+                  ((u0(i+1,j,k)-u0(i+1,j-1,k))  / dy + &
+                   (v0(i+1,j,k)-v0(i,j,k))      / dx )**2    + &
+                  ((u0(i+1,j+1,k)-u0(i+1,j,k))  / dy + &
+                   (v0(i+1,j+1,k)-v0(i,j+1,k))  / dx )**2    )
+
+        tdef2 = tdef2 + 0.25 * ( &
+                  ((v0(i,j,k+1)-v0(i,j,k))     / dzh(k+1) + &
+                   (w0(i,j,k+1)-w0(i,j-1,k+1)) / dy       )**2    + &
+                  ((v0(i,j,k)-v0(i,j,k-1))     / dzh(k)   + &
+                   (w0(i,j,k)-w0(i,j-1,k))     / dy       )**2    + &
+                  ((v0(i,j+1,k)-v0(i,j+1,k-1)) / dzh(k)   + &
+                   (w0(i,j+1,k)-w0(i,j,k))     / dy       )**2    + &
+                  ((v0(i,j+1,k+1)-v0(i,j+1,k)) / dzh(k+1) + &
+                   (w0(i,j+1,k+1)-w0(i,j,k+1)) / dy       )**2    )
+
+        e12p(i,j,k) = e12p(i,j,k) &
+                    + (ekm(i,j,k)*tdef2 - ekh(i,j,k)*grav/thvf(k)*dthvdz(i,j,k) ) / (2*e120(i,j,k)) &  !  sbshr and sbbuo
+                    - (ce1 + ce2*zlt(i,j,k)*deltai(k)) * e120(i,j,k)**2 /(2.*zlt(i,j,k))               !  sbdiss
+      end do
+    end do
   end do
 !     ----------------------------------------------end i,j,k-loop
 
@@ -499,77 +492,148 @@ subroutine sources
 !     special treatment for lowest full level: k=1
 !     --------------------------------------------
 
-
-  do j=2,j1
-    jp=j+1
-    jm=j-1
-  do i=2,i1
-
-
-
-! **  Calculate "shear" production term: tdef2  ****************
-
-    tdef2 =  2. * ( &
-            ((u0(i+1,j,1)-u0(i,j,1))*dxi)**2 &
-          + ((v0(i,jp,1)-v0(i,j,1))*dyi)**2 &
-          + ((w0(i,j,2)-w0(i,j,1))/dzf(1))**2   )
-
-    if (sgs_surface_fix) then
-          ! Use known surface flux and exchange coefficient to derive 
-          ! consistent gradient (such that correct flux will occur in 
-          ! shear production term)
-          ! Make sure that no division by zero occurs in determination of the
-          ! directional component; ekm should already be >= ekmin
-          ! Replace the dudz by surface flux -uw / ekm
-          horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2),  0.01)
-          uwflux = -ustar(i,j)*ustar(i,j)* ((u0(i,j,1)+cu)/horv)
-          local_dudz = -uwflux / ekm(i,j,1)
-          tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
-               local_dudz )**2
-    else
-          tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
-                                  dudz(i,j)   )**2
-    endif
-
-    tdef2 = tdef2 +   0.25 *( &
-          ((u0(i,jp,1)-u0(i,j,1))*dyi+(v0(i,jp,1)-v0(i-1,jp,1))*dxi)**2 &
-         +((u0(i,j,1)-u0(i,jm,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 &
-         +((u0(i+1,j,1)-u0(i+1,jm,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 &
-         +((u0(i+1,jp,1)-u0(i+1,j,1))*dyi+ &
-                                 (v0(i+1,jp,1)-v0(i,jp,1))*dxi)**2   )
-
-    if (sgs_surface_fix) then
-          ! Use known surface flux and exchange coefficient to derive 
-          ! consistent gradient (such that correct flux will occur in 
-          ! shear production term)
-          ! Make sure that no division by zero occurs in determination of the
-          ! directional component; ekm should already be >= ekmin
-          ! Replace the dvdz by surface flux -vw / ekm
-          horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2),  0.01)
-          vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv)
-          local_dvdz = -vwflux / ekm(i,j,1)
-          tdef2 = tdef2 + ( 0.25*(w0(i,jp,2)-w0(i,jm,2))*dyi + &
-                        local_dvdz  )**2
-    else
-         tdef2 = tdef2 + ( 0.25*(w0(i,jp,2)-w0(i,jm,2))*dyi + &
+  if (sgs_surface_fix) then
+    do j=2,j1
+      do i=2,i1
+        tdef2 = 2. * ( &
+                ((u0(i+1,j,1)-u0(i,j,1)) *dxi    )**2 &
+              + ((v0(i,j+1,1)-v0(i,j,1)) *dyi    )**2 &
+              + ((w0(i,j,2)-w0(i,j,1))   /dzf(1) )**2   )
+        ! Use known surface flux and exchange coefficient to derive 
+        ! consistent gradient (such that correct flux will occur in 
+        ! shear production term)
+        ! Make sure that no division by zero occurs in determination of the
+        ! directional component; ekm should already be >= ekmin
+        ! Replace the dudz by surface flux -uw / ekm
+        horv = max(sqrt((u0(i,j,1)+cu)**2 + (v0(i,j,1)+cv)**2), 0.01)
+        uwflux = -ustar(i,j)*ustar(i,j) * ((u0(i,j,1)+cu)/horv)
+        local_dudz = -uwflux / ekm(i,j,1)
+
+        tdef2 = tdef2 +  ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
+                local_dudz )**2
+
+        tdef2 = tdef2 +   0.25 *( &
+               ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 &
+              +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 &
+              +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 &
+              +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi &
+              +(v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2   )
+
+        ! Use known surface flux and exchange coefficient to derive 
+        ! consistent gradient (such that correct flux will occur in 
+        ! shear production term)
+        ! Make sure that no division by zero occurs in determination of the
+        ! directional component; ekm should already be >= ekmin
+        ! Replace the dvdz by surface flux -vw / ekm
+        vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv)
+        local_dvdz = -vwflux / ekm(i,j,1)
+        tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + &
+                      local_dvdz  )**2
+
+        ! **  Include shear and buoyancy production terms and dissipation **
+        sbshr(i,j,1)  = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1))
+
+        ! Replace the -ekh *  dthvdz by the surface flux of thv
+        ! (but we only have the thlflux , which seems at the surface to be
+        ! equivalent
+        local_dthvdz = -thlflux(i,j)/ekh(i,j,1)
+        sbbuo(i,j,1)  = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1))
+        sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1))
+      end do
+    end do
+  else
+    do j=2,j1
+      do i=2,i1
+        tdef2 = 2. * ( &
+                ((u0(i+1,j,1)-u0(i,j,1)) *dxi    )**2 &
+              + ((v0(i,j+1,1)-v0(i,j,1)) *dyi    )**2 &
+              + ((w0(i,j,2)-w0(i,j,1))   /dzf(1) )**2   )
+
+        tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
+                                dudz(i,j)   )**2
+
+        tdef2 = tdef2 +   0.25 *( &
+               ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 &
+              +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 &
+              +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 &
+              +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi &
+              +(v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2   )
+
+        tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + &
                                 dvdz(i,j)   )**2
-    endif
 
-! **  Include shear and buoyancy production terms and dissipation **
+        sbshr(i,j,1)  = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1))
+        sbbuo(i,j,1)  = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1))
+        sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1))
+      end do 
+    end do
+  endif
 
-    sbshr(i,j,1)  = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1))
-    if (sgs_surface_fix) then
-          ! Replace the -ekh *  dthvdz by the surface flux of thv
-          ! (but we only have the thlflux , which seems at the surface to be
-          ! equivalent
-          local_dthvdz = -thlflux(i,j)/ekh(i,j,1)
-          sbbuo(i,j,1)  = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1))
-    else
-          sbbuo(i,j,1)  = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1))
-    endif
-    sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1))
-  end do
-  end do
+
+!  do j=2,j1
+!    do i=2,i1
+!    ! **  Calculate "shear" production term: tdef2  ****************
+!    tdef2 =  2. * ( &
+!            ((u0(i+1,j,1)-u0(i,j,1))*dxi)**2 &
+!          + ((v0(i,j+1,1)-v0(i,j,1))*dyi)**2 &
+!          + ((w0(i,j,2)-w0(i,j,1))/dzf(1))**2   )
+!
+!    if (sgs_surface_fix) then
+!          ! Use known surface flux and exchange coefficient to derive 
+!          ! consistent gradient (such that correct flux will occur in 
+!          ! shear production term)
+!          ! Make sure that no division by zero occurs in determination of the
+!          ! directional component; ekm should already be >= ekmin
+!          ! Replace the dudz by surface flux -uw / ekm
+!          horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2),  0.01)
+!          uwflux = -ustar(i,j)*ustar(i,j)* ((u0(i,j,1)+cu)/horv)
+!          local_dudz = -uwflux / ekm(i,j,1)
+!          tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
+!               local_dudz )**2
+!    else
+!          tdef2 = tdef2 + ( 0.25*(w0(i+1,j,2)-w0(i-1,j,2))*dxi + &
+!                                  dudz(i,j)   )**2
+!    endif
+!
+!    tdef2 = tdef2 +   0.25 *( &
+!          ((u0(i,j+1,1)-u0(i,j,1))*dyi+(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi)**2 &
+!         +((u0(i,j,1)-u0(i,j-1,1))*dyi+(v0(i,j,1)-v0(i-1,j,1))*dxi)**2 &
+!         +((u0(i+1,j,1)-u0(i+1,j-1,1))*dyi+(v0(i+1,j,1)-v0(i,j,1))*dxi)**2 &
+!         +((u0(i+1,j+1,1)-u0(i+1,j,1))*dyi+ &
+!                                 (v0(i+1,j+1,1)-v0(i,j+1,1))*dxi)**2   )
+!
+!    if (sgs_surface_fix) then
+!          ! Use known surface flux and exchange coefficient to derive 
+!          ! consistent gradient (such that correct flux will occur in 
+!          ! shear production term)
+!          ! Make sure that no division by zero occurs in determination of the
+!          ! directional component; ekm should already be >= ekmin
+!          ! Replace the dvdz by surface flux -vw / ekm
+!          horv = max(sqrt((u0(i,j,1)+cu)**2+(v0(i,j,1)+cv)**2),  0.01)
+!          vwflux = -ustar(i,j)*ustar(i,j)* ((v0(i,j,1)+cv)/horv)
+!          local_dvdz = -vwflux / ekm(i,j,1)
+!          tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + &
+!                        local_dvdz  )**2
+!    else
+!         tdef2 = tdef2 + ( 0.25*(w0(i,j+1,2)-w0(i,j-1,2))*dyi + &
+!                                dvdz(i,j)   )**2
+!    endif
+!
+!! **  Include shear and buoyancy production terms and dissipation **
+!
+!    sbshr(i,j,1)  = ekm(i,j,1)*tdef2/ ( 2*e120(i,j,1))
+!    if (sgs_surface_fix) then
+!          ! Replace the -ekh *  dthvdz by the surface flux of thv
+!          ! (but we only have the thlflux , which seems at the surface to be
+!          ! equivalent
+!          local_dthvdz = -thlflux(i,j)/ekh(i,j,1)
+!          sbbuo(i,j,1)  = -ekh(i,j,1)*grav/thvf(1)*local_dthvdz/ ( 2*e120(i,j,1))
+!    else
+!          sbbuo(i,j,1)  = -ekh(i,j,1)*grav/thvf(1)*dthvdz(i,j,1)/ ( 2*e120(i,j,1))
+!    endif
+!    sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1))
+!  end do
+!  end do
 
   e12p(2:i1,2:j1,1) = e12p(2:i1,2:j1,1) + &
             sbshr(2:i1,2:j1,1)+sbbuo(2:i1,2:j1,1)+sbdiss(2:i1,2:j1,1)  
@@ -603,14 +667,14 @@ subroutine diffc (a_in,a_out,flux)
                   ( (ekh(i+1,j,k)+ekh(i,j,k))*(a_in(i+1,j,k)-a_in(i,j,k)) &
                     -(ekh(i,j,k)+ekh(i-1,j,k))*(a_in(i,j,k)-a_in(i-1,j,k)))*dx2i * anis_fac(k) &
                     + &
-                  ( (ekh(i,jp,k)+ekh(i,j,k)) *(a_in(i,jp,k)-a_in(i,j,k)) &
-                    -(ekh(i,j,k)+ekh(i,jm,k)) *(a_in(i,j,k)-a_in(i,jm,k)) )*dy2i * anis_fac(k) &
+                  ( (ekh(i,j+1,k)+ekh(i,j,k)) *(a_in(i,j+1,k)-a_in(i,j,k)) &
+                    -(ekh(i,j,k)+ekh(i,j-1,k)) *(a_in(i,j,k)-a_in(i,j-1,k)) )*dy2i * anis_fac(k) &
                   + &
-                  ( rhobh(kp)/rhobf(k) * (dzf(kp)*ekh(i,j,k) + dzf(k)*ekh(i,j,kp)) &
-                    *  (a_in(i,j,kp)-a_in(i,j,k)) / dzh(kp)**2 &
+                  ( rhobh(k+1)/rhobf(k) * (dzf(k+1)*ekh(i,j,k) + dzf(k)*ekh(i,j,k+1)) &
+                    *  (a_in(i,j,k+1)-a_in(i,j,k)) / dzh(k+1)**2 &
                     - &
-                    rhobh(k)/rhobf(k) * (dzf(km)*ekh(i,j,k) + dzf(k)*ekh(i,j,km)) &
-                    *  (a_in(i,j,k)-a_in(i,j,km)) / dzh(k)**2           )/dzf(k) &
+                    rhobh(k)/rhobf(k) * (dzf(k-1)*ekh(i,j,k) + dzf(k)*ekh(i,j,k-1)) &
+                    *  (a_in(i,j,k)-a_in(i,j,k-1)) / dzh(k)**2           )/dzf(k) &
                             )
 
         end do
@@ -664,13 +728,13 @@ subroutine diffe(a_out)
               ((ekm(i+1,j,k)+ekm(i,j,k))*(e120(i+1,j,k)-e120(i,j,k)) &
               -(ekm(i,j,k)+ekm(i-1,j,k))*(e120(i,j,k)-e120(i-1,j,k)))*dx2i * anis_fac(k) &
                   + &
-              ((ekm(i,jp,k)+ekm(i,j,k)) *(e120(i,jp,k)-e120(i,j,k)) &
-              -(ekm(i,j,k)+ekm(i,jm,k)) *(e120(i,j,k)-e120(i,jm,k)) )*dy2i * anis_fac(k) &
+              ((ekm(i,j+1,k)+ekm(i,j,k)) *(e120(i,j+1,k)-e120(i,j,k)) &
+              -(ekm(i,j,k)+ekm(i,j-1,k)) *(e120(i,j,k)-e120(i,j-1,k)) )*dy2i * anis_fac(k) &
                   + &
-              (rhobh(kp)/rhobf(k) * (dzf(kp)*ekm(i,j,k) + dzf(k)*ekm(i,j,kp)) &
-              *(e120(i,j,kp)-e120(i,j,k)) / dzh(kp)**2 &
-              - rhobh(k)/rhobf(k) * (dzf(km)*ekm(i,j,k) + dzf(k)*ekm(i,j,km)) &
-              *(e120(i,j,k)-e120(i,j,km)) / dzh(k)**2        )/dzf(k) &
+              (rhobh(k+1)/rhobf(k) * (dzf(k+1)*ekm(i,j,k) + dzf(k)*ekm(i,j,k+1)) &
+              *(e120(i,j,k+1)-e120(i,j,k)) / dzh(k+1)**2 &
+              - rhobh(k)/rhobf(k) * (dzf(k-1)*ekm(i,j,k) + dzf(k)*ekm(i,j,k-1)) &
+              *(e120(i,j,k)-e120(i,j,k-1)) / dzh(k)**2        )/dzf(k) &
                             )
 
         end do
@@ -714,28 +778,22 @@ subroutine diffu (a_out)
     integer             :: i,j,k,jm,jp,km,kp
 
     do k=2,kmax
-      kp=k+1
-      km=k-1
-
       do j=2,j1
-        jp=j+1
-        jm=j-1
-
         do i=2,i1
 
-          emom = ( dzf(km) * ( ekm(i,j,k)  + ekm(i-1,j,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,km) + ekm(i-1,j,km) ) ) / &
+          emom = ( dzf(k-1) * ( ekm(i,j,k)  + ekm(i-1,j,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k-1) + ekm(i-1,j,k-1) ) ) / &
                     ( 4.   * dzh(k) )
 
-          emop = ( dzf(kp) * ( ekm(i,j,k)  + ekm(i-1,j,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,kp) + ekm(i-1,j,kp) ) ) / &
-                    ( 4.   * dzh(kp) )
+          emop = ( dzf(k+1) * ( ekm(i,j,k)  + ekm(i-1,j,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k+1) + ekm(i-1,j,k+1) ) ) / &
+                    ( 4.   * dzh(k+1) )
 
           empo = 0.25 * ( &
-                  ekm(i,j,k)+ekm(i,jp,k)+ekm(i-1,jp,k)+ekm(i-1,j,k)  )
+                  ekm(i,j,k)+ekm(i,j+1,k)+ekm(i-1,j+1,k)+ekm(i-1,j,k)  )
 
           emmo = 0.25 * ( &
-                  ekm(i,j,k)+ekm(i,jm,k)+ekm(i-1,jm,k)+ekm(i-1,j,k)  )
+                  ekm(i,j,k)+ekm(i,j-1,k)+ekm(i-1,j-1,k)+ekm(i-1,j,k)  )
 
 
           a_out(i,j,k) = a_out(i,j,k) &
@@ -743,14 +801,14 @@ subroutine diffu (a_out)
                   ( ekm(i,j,k)  * (u0(i+1,j,k)-u0(i,j,k)) &
                     -ekm(i-1,j,k)* (u0(i,j,k)-u0(i-1,j,k)) ) * 2. * dx2i * anis_fac(k) &
                   + &
-                  ( empo * ( (u0(i,jp,k)-u0(i,j,k))   *dyi &
-                            +(v0(i,jp,k)-v0(i-1,jp,k))*dxi) &
-                    -emmo * ( (u0(i,j,k)-u0(i,jm,k))   *dyi &
+                  ( empo * ( (u0(i,j+1,k)-u0(i,j,k))   *dyi &
+                            +(v0(i,j+1,k)-v0(i-1,j+1,k))*dxi) &
+                    -emmo * ( (u0(i,j,k)-u0(i,j-1,k))   *dyi &
                             +(v0(i,j,k)-v0(i-1,j,k))  *dxi)   ) / dy * anis_fac(k) &
                   + &
-                  ( rhobh(kp)/rhobf(k) * emop * ( (u0(i,j,kp)-u0(i,j,k))   /dzh(kp) &
-                            +(w0(i,j,kp)-w0(i-1,j,kp))*dxi) &
-                    - rhobh(k)/rhobf(k) * emom * ( (u0(i,j,k)-u0(i,j,km))   /dzh(k) &
+                  ( rhobh(k+1)/rhobf(k) * emop * ( (u0(i,j,k+1)-u0(i,j,k))   /dzh(k+1) &
+                            +(w0(i,j,k+1)-w0(i-1,j,k+1))*dxi) &
+                    - rhobh(k)/rhobf(k) * emom * ( (u0(i,j,k)-u0(i,j,k-1))   /dzh(k) &
                             +(w0(i,j,k)-w0(i-1,j,k))  *dxi)   ) /dzf(k)
 
         end do
@@ -762,16 +820,13 @@ subroutine diffu (a_out)
   !     --------------------------------------------
 
     do j=2,j1
-      jp = j+1
-      jm = j-1
-
       do i=2,i1
 
         empo = 0.25 * ( &
-              ekm(i,j,1)+ekm(i,jp,1)+ekm(i-1,jp,1)+ekm(i-1,j,1)  )
+              ekm(i,j,1)+ekm(i,j+1,1)+ekm(i-1,j+1,1)+ekm(i-1,j,1)  )
 
         emmo = 0.25 * ( &
-              ekm(i,j,1)+ekm(i,jm,1)+ekm(i-1,jm,1)+ekm(i-1,j,1)  )
+              ekm(i,j,1)+ekm(i,j-1,1)+ekm(i-1,j-1,1)+ekm(i-1,j,1)  )
 
         emop = ( dzf(2) * ( ekm(i,j,1) + ekm(i-1,j,1) )  + &
                     dzf(1) * ( ekm(i,j,2) + ekm(i-1,j,2) ) ) / &
@@ -780,25 +835,28 @@ subroutine diffu (a_out)
 
         ucu   = 0.5*(u0(i,j,1)+u0(i+1,j,1))+cu
 
-        if(ucu >= 0.) then
-          upcu  = max(ucu,1.e-10)
-        else
-          upcu  = min(ucu,-1.e-10)
-        end if
-
+        !if(ucu >= 0.) then
+        !  upcu  = max(ucu,1.e-10)
+        !else
+        !  upcu  = min(ucu,-1.e-10)
+        !end if
+        
+        ! Branchless version of the algorithm above,
+        ! may or may not be more efficient
+        upcu = sign(1.,ucu) * max(abs(ucu),1.e-10)
 
         fu = ( 0.5*( ustar(i,j)+ustar(i-1,j) ) )**2  * &
                 upcu/sqrt(upcu**2  + &
-                ((v0(i,j,1)+v0(i-1,j,1)+v0(i,jp,1)+v0(i-1,jp,1))/4.+cv)**2)
+                ((v0(i,j,1)+v0(i-1,j,1)+v0(i,j+1,1)+v0(i-1,j+1,1))/4.+cv)**2)
 
         a_out(i,j,1) = a_out(i,j,1) &
                 + &
               ( ekm(i,j,1)  * (u0(i+1,j,1)-u0(i,j,1)) &
               -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) &
                 + &
-              ( empo * ( (u0(i,jp,1)-u0(i,j,1))   *dyi &
-                        +(v0(i,jp,1)-v0(i-1,jp,1))*dxi) &
-              -emmo * ( (u0(i,j,1)-u0(i,jm,1))   *dyi &
+              ( empo * ( (u0(i,j+1,1)-u0(i,j,1))   *dyi &
+                        +(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi) &
+              -emmo * ( (u0(i,j,1)-u0(i,j-1,1))   *dyi &
                         +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) / dy * anis_fac(k) &
                + &
               ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1))    /dzh(2) &
@@ -825,44 +883,38 @@ subroutine diffv (a_out)
     integer             :: i,j,k,jm,jp,km,kp
 
     do k=2,kmax
-      kp=k+1
-      km=k-1
-
       do j=2,j1
-        jp=j+1
-        jm=j-1
-
         do i=2,i1
 
-          eomm = ( dzf(km) * ( ekm(i,j,k)  + ekm(i,jm,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,km) + ekm(i,jm,km) ) ) / &
+          eomm = ( dzf(k-1) * ( ekm(i,j,k)  + ekm(i,j-1,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k-1) + ekm(i,j-1,k-1) ) ) / &
                     ( 4.   * dzh(k) )
 
-          eomp = ( dzf(kp) * ( ekm(i,j,k)  + ekm(i,jm,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,kp) + ekm(i,jm,kp) ) ) / &
-                    ( 4.   * dzh(kp) )
+          eomp = ( dzf(k+1) * ( ekm(i,j,k)  + ekm(i,j-1,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k+1) + ekm(i,j-1,k+1) ) ) / &
+                    ( 4.   * dzh(k+1) )
 
           emmo = 0.25  * ( &
-                ekm(i,j,k)+ekm(i,jm,k)+ekm(i-1,jm,k)+ekm(i-1,j,k)  )
+                ekm(i,j,k)+ekm(i,j-1,k)+ekm(i-1,j-1,k)+ekm(i-1,j,k)  )
 
           epmo = 0.25  * ( &
-                ekm(i,j,k)+ekm(i,jm,k)+ekm(i+1,jm,k)+ekm(i+1,j,k)  )
+                ekm(i,j,k)+ekm(i,j-1,k)+ekm(i+1,j-1,k)+ekm(i+1,j,k)  )
 
 
         a_out(i,j,k) = a_out(i,j,k) &
                 + &
               ( epmo * ( (v0(i+1,j,k)-v0(i,j,k))   *dxi &
-                        +(u0(i+1,j,k)-u0(i+1,jm,k))*dyi) &
+                        +(u0(i+1,j,k)-u0(i+1,j-1,k))*dyi) &
                 -emmo * ( (v0(i,j,k)-v0(i-1,j,k))   *dxi &
-                        +(u0(i,j,k)-u0(i,jm,k))    *dyi)   ) / dx * anis_fac(k) &
+                        +(u0(i,j,k)-u0(i,j-1,k))    *dyi)   ) / dx * anis_fac(k) &
                 + &
-              (ekm(i,j,k) * (v0(i,jp,k)-v0(i,j,k)) &
-              -ekm(i,jm,k)* (v0(i,j,k)-v0(i,jm,k))  ) * 2. * dy2i * anis_fac(k) &
+              (ekm(i,j,k) * (v0(i,j+1,k)-v0(i,j,k)) &
+              -ekm(i,j-1,k)* (v0(i,j,k)-v0(i,j-1,k))  ) * 2. * dy2i * anis_fac(k) &
                 + &
-              ( rhobh(kp)/rhobf(k) * eomp * ( (v0(i,j,kp)-v0(i,j,k))    /dzh(kp) &
-                        +(w0(i,j,kp)-w0(i,jm,kp))  *dyi) &
-                - rhobh(k)/rhobf(k) * eomm * ( (v0(i,j,k)-v0(i,j,km))    /dzh(k) &
-                        +(w0(i,j,k)-w0(i,jm,k))    *dyi)   ) / dzf(k)
+              ( rhobh(k+1)/rhobf(k) * eomp * ( (v0(i,j,k+1)-v0(i,j,k))    /dzh(k+1) &
+                        +(w0(i,j,k+1)-w0(i,j-1,k+1))  *dyi) &
+                - rhobh(k)/rhobf(k) * eomm * ( (v0(i,j,k)-v0(i,j,k-1))    /dzh(k) &
+                        +(w0(i,j,k)-w0(i,j-1,k))    *dyi)   ) / dzf(k)
 
         end do
       end do
@@ -873,44 +925,45 @@ subroutine diffv (a_out)
   !     --------------------------------------------
 
     do j=2,j1
-      jp = j+1
-      jm = j-1
       do i=2,i1
 
         emmo = 0.25 * ( &
-              ekm(i,j,1)+ekm(i,jm,1)+ekm(i-1,jm,1)+ekm(i-1,j,1)  )
+              ekm(i,j,1)+ekm(i,j-1,1)+ekm(i-1,j-1,1)+ekm(i-1,j,1)  )
 
         epmo = 0.25  * ( &
-              ekm(i,j,1)+ekm(i,jm,1)+ekm(i+1,jm,1)+ekm(i+1,j,1)  )
+              ekm(i,j,1)+ekm(i,j-1,1)+ekm(i+1,j-1,1)+ekm(i+1,j,1)  )
 
-        eomp = ( dzf(2) * ( ekm(i,j,1) + ekm(i,jm,1)  )  + &
-                    dzf(1) * ( ekm(i,j,2) + ekm(i,jm,2) ) ) / &
+        eomp = ( dzf(2) * ( ekm(i,j,1) + ekm(i,j-1,1)  )  + &
+                    dzf(1) * ( ekm(i,j,2) + ekm(i,j-1,2) ) ) / &
                   ( 4.   * dzh(2) )
 
         vcv   = 0.5*(v0(i,j,1)+v0(i,j+1,1))+cv
-        if(vcv >= 0.) then
-          vpcv  = max(vcv,1.e-10)
-        else
-          vpcv  = min(vcv,-1.e-10)
-        end if
+
+        !if(vcv >= 0.) then
+        !  vpcv  = max(vcv,1.e-10)
+        !else
+        !  vpcv  = min(vcv,-1.e-10)
+        !end if
+
+        vpcv = sign(1., vcv) * max(abs(vcv),1.e-10)
 
 
         fv    = ( 0.5*( ustar(i,j)+ustar(i,j-1) ) )**2  * &
                     vpcv/sqrt(vpcv**2  + &
-                ((u0(i,j,1)+u0(i+1,j,1)+u0(i,jm,1)+u0(i+1,jm,1))/4.+cu)**2)
+                ((u0(i,j,1)+u0(i+1,j,1)+u0(i,j-1,1)+u0(i+1,j-1,1))/4.+cu)**2)
 
         a_out(i,j,1) = a_out(i,j,1) &
                   + &
                   ( epmo * ( (v0(i+1,j,1)-v0(i,j,1))   *dxi &
-                            +(u0(i+1,j,1)-u0(i+1,jm,1))*dyi) &
+                            +(u0(i+1,j,1)-u0(i+1,j-1,1))*dyi) &
                     -emmo * ( (v0(i,j,1)-v0(i-1,j,1))   *dxi &
-                            +(u0(i,j,1)-u0(i,jm,1))    *dyi)   ) / dx * anis_fac(k) &
+                            +(u0(i,j,1)-u0(i,j-1,1))    *dyi)   ) / dx * anis_fac(k) &
                   + &
-                ( ekm(i,j,1) * (v0(i,jp,1)-v0(i,j,1)) &
-                  -ekm(i,jm,1)* (v0(i,j,1)-v0(i,jm,1))  ) * 2. * dy2i * anis_fac(k) &
+                ( ekm(i,j,1) * (v0(i,j+1,1)-v0(i,j,1)) &
+                  -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1))  ) * 2. * dy2i * anis_fac(k) &
                   + &
                 ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1))     /dzh(2) &
-                          +(w0(i,j,2)-w0(i,jm,2))    *dyi) &
+                          +(w0(i,j,2)-w0(i,j-1,2))    *dyi) &
                   -rhobh(1)/rhobf(1)*fv   ) / dzf(1)
 
       end do
@@ -933,44 +986,40 @@ subroutine diffw(a_out)
     integer             :: i,j,k,jm,jp,km,kp
 
     do k=2,kmax
-      kp=k+1
-      km=k-1
       do j=2,j1
-        jp=j+1
-        jm=j-1
         do i=2,i1
 
-          emom = ( dzf(km) * ( ekm(i,j,k)  + ekm(i-1,j,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,km) + ekm(i-1,j,km) ) ) / &
+          emom = ( dzf(k-1) * ( ekm(i,j,k)  + ekm(i-1,j,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k-1) + ekm(i-1,j,k-1) ) ) / &
                     ( 4.   * dzh(k) )
 
-          eomm = ( dzf(km) * ( ekm(i,j,k)  + ekm(i,jm,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,km) + ekm(i,jm,km) ) ) / &
+          eomm = ( dzf(k-1) * ( ekm(i,j,k)  + ekm(i,j-1,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k-1) + ekm(i,j-1,k-1) ) ) / &
                     ( 4.   * dzh(k) )
 
-          eopm = ( dzf(km) * ( ekm(i,j,k)  + ekm(i,jp,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,km) + ekm(i,jp,km) ) ) / &
+          eopm = ( dzf(k-1) * ( ekm(i,j,k)  + ekm(i,j+1,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k-1) + ekm(i,j+1,k-1) ) ) / &
                     ( 4.   * dzh(k) )
 
-          epom = ( dzf(km) * ( ekm(i,j,k)  + ekm(i+1,j,k)  )  + &
-                      dzf(k)  * ( ekm(i,j,km) + ekm(i+1,j,km) ) ) / &
+          epom = ( dzf(k-1) * ( ekm(i,j,k)  + ekm(i+1,j,k)  )  + &
+                      dzf(k)  * ( ekm(i,j,k-1) + ekm(i+1,j,k-1) ) ) / &
                     ( 4.   * dzh(k) )
 
 
           a_out(i,j,k) = a_out(i,j,k) &
                 + &
                   ( epom * ( (w0(i+1,j,k)-w0(i,j,k))    *dxi &
-                            +(u0(i+1,j,k)-u0(i+1,j,km)) /dzh(k) ) &
+                            +(u0(i+1,j,k)-u0(i+1,j,k-1)) /dzh(k) ) &
                     -emom * ( (w0(i,j,k)-w0(i-1,j,k))    *dxi &
-                            +(u0(i,j,k)-u0(i,j,km))     /dzh(k) ))/dx * anis_fac(k) &
+                            +(u0(i,j,k)-u0(i,j,k-1))     /dzh(k) ))/dx * anis_fac(k) &
                 + &
-                  ( eopm * ( (w0(i,jp,k)-w0(i,j,k))     *dyi &
-                            +(v0(i,jp,k)-v0(i,jp,km))   /dzh(k) ) &
-                    -eomm * ( (w0(i,j,k)-w0(i,jm,k))     *dyi &
-                            +(v0(i,j,k)-v0(i,j,km))     /dzh(k) ))/dy * anis_fac(k) &
+                  ( eopm * ( (w0(i,j+1,k)-w0(i,j,k))     *dyi &
+                            +(v0(i,j+1,k)-v0(i,j+1,k-1))   /dzh(k) ) &
+                    -eomm * ( (w0(i,j,k)-w0(i,j-1,k))     *dyi &
+                            +(v0(i,j,k)-v0(i,j,k-1))     /dzh(k) ))/dy * anis_fac(k) &
                 + (1./rhobh(k))*&
-                  ( rhobf(k) * ekm(i,j,k) * (w0(i,j,kp)-w0(i,j,k)) /dzf(k) &
-                  - rhobf(km) * ekm(i,j,km)* (w0(i,j,k)-w0(i,j,km)) /dzf(km) ) * 2. &
+                  ( rhobf(k) * ekm(i,j,k) * (w0(i,j,k+1)-w0(i,j,k)) /dzf(k) &
+                  - rhobf(k-1) * ekm(i,j,k-1)* (w0(i,j,k)-w0(i,j,k-1)) /dzf(k-1) ) * 2. &
                                                               / dzh(k)
 
         end do

From 135334fbffb7ecfd34090495ac0a8f9258f51421 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 09:30:54 +0200
Subject: [PATCH 11/19] Change directives

---
 src/modmpi.f90 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/modmpi.f90 b/src/modmpi.f90
index eca1166d5..fe6a1bc09 100644
--- a/src/modmpi.f90
+++ b/src/modmpi.f90
@@ -35,7 +35,7 @@
 
 module modmpi
 use modmpiinterface
-#ifdef _OPENACC
+#if defined(_OPENACC)
 use openacc
 #endif
 implicit none
@@ -348,7 +348,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sende,recve &
                                           , sendw,recvw
   ! Check if the data is on the gpu
-  #ifdef _OPENACC
+  #if defined(_OPENACC)
   integer :: is_present
   is_present = acc_is_present(a)
   #endif
@@ -470,7 +470,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sende,recve &
                                           , sendw,recvw
   ! Check if the data is on the gpu
-  #ifdef _OPENACC
+  #if defined(_OPENACC)
   integer :: is_present
   is_present = acc_is_present(a)
   #endif

From d8bf150c898fbb2d71770d4d76f1b83938881799 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 10:01:32 +0200
Subject: [PATCH 12/19] Remove unused loop indices

---
 src/modsubgrid.f90 | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 334541b5e..254388bc0 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -440,7 +440,7 @@ subroutine sources
   implicit none
 
   real    tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv
-  integer i,j,k,jm,jp,km,kp
+  integer i,j,k
 
   do k=2,kmax
     do j=2,j1
@@ -651,16 +651,10 @@ subroutine diffc (a_in,a_out,flux)
     real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1)
     real, intent(in)    :: flux (i2,j2)
 
-    integer i,j,k,jm,jp,km,kp
-
+    integer i,j,k
+    
     do k=2,kmax
-      kp=k+1
-      km=k-1
-
       do j=2,j1
-        jp=j+1
-        jm=j-1
-
         do i=2,i1
           a_out(i,j,k) = a_out(i,j,k) &
                     +  0.5 * ( &
@@ -711,16 +705,9 @@ subroutine diffe(a_out)
     implicit none
 
     real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1)
-    integer             :: i,j,k,jm,jp,km,kp
-
+    integer             :: i,j,k
     do k=2,kmax
-      kp=k+1
-      km=k-1
-
       do j=2,j1
-        jp=j+1
-        jm=j-1
-
         do i=2,i1
 
           a_out(i,j,k) = a_out(i,j,k) &
@@ -775,7 +762,7 @@ subroutine diffu (a_out)
     real                :: emmo,emom,emop,empo
     real                :: fu
     real                :: ucu, upcu
-    integer             :: i,j,k,jm,jp,km,kp
+    integer             :: i,j,k
 
     do k=2,kmax
       do j=2,j1
@@ -880,7 +867,7 @@ subroutine diffv (a_out)
     real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1)
     real                :: emmo, eomm,eomp,epmo
     real                :: fv, vcv,vpcv
-    integer             :: i,j,k,jm,jp,km,kp
+    integer             :: i,j,k
 
     do k=2,kmax
       do j=2,j1
@@ -983,8 +970,7 @@ subroutine diffw(a_out)
 
     real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1)
     real                :: emom, eomm, eopm, epom
-    integer             :: i,j,k,jm,jp,km,kp
-
+    integer             :: i,j,k
     do k=2,kmax
       do j=2,j1
         do i=2,i1

From dd5a67e32baef6efc9d9980e62923e26fa1c0006 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 10:04:36 +0200
Subject: [PATCH 13/19] Update precision.yml

---
 .github/workflows/precision.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/precision.yml b/.github/workflows/precision.yml
index 4a073a8fc..48e3f340f 100644
--- a/.github/workflows/precision.yml
+++ b/.github/workflows/precision.yml
@@ -4,7 +4,13 @@ name: DALES precision CI
 on:
   # Triggers the workflow on push or pull request events but only for the feature/single_precision branch
   push:
+    branches:
+      - master
+      - OpenACC-Dev
   pull_request:
+    branches:
+      - master
+      - OpenACC-Dev
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 

From d70ddc26c8dd0c254a96d5e33f3bd087c856b0ff Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 10:09:03 +0200
Subject: [PATCH 14/19] Offload loops

---
 src/modsubgrid.f90 | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 254388bc0..3aad7b82a 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -232,6 +232,7 @@ subroutine closure
     call timer_tic("Smagorinsky", 1)
     ! First level
     mlen = csz(1) * delta(1)
+    !$acc parallel loop collapse(2) private(strain2) async(1)
     do i = 2,i1
       do j = 2,j1
         strain2 =  ( &
@@ -266,8 +267,11 @@ subroutine closure
     end do
     
     ! Other levels
+    ! TODO: make this collapsable
+    !$acc loop private(mlen)
     do k = 2,kmax
       mlen = csz(k) * delta(k)
+      !$acc parallel loop collapse(2) private(strain2) async(1)
       do i = 2,i1
         do j = 2,j1
           strain2 =  ( &
@@ -318,6 +322,7 @@ subroutine closure
  else
     ! choose one of ldelta, ldelta+lmason, lanisotropic, or none of them for Deardorff length scale adjustment
     if (ldelta .and. .not. lmason) then
+       !$acc parallel loop collapse(3) default(present)
        do k=1,kmax
           do j=2,j1
              do i=2,i1
@@ -332,6 +337,7 @@ subroutine closure
           end do
        end do
     else if (ldelta .and. lmason) then ! delta scheme with Mason length scale correction
+       !$acc parallel loop collapse(3) default(present)
        do k=1,kmax
           do j=2,j1
              do i=2,i1
@@ -347,6 +353,7 @@ subroutine closure
           end do
        end do
     else if (lanisotrop) then ! Anisotropic diffusion,  https://doi.org/10.1029/2022MS003095
+       !$acc parallel loop collapse(3) default(present)
        do k=1,kmax
           do j=2,j1
              do i=2,i1
@@ -361,6 +368,7 @@ subroutine closure
           end do
        end do
     else ! Deardorff lengthscale correction
+       !$acc parallel loop collapse(3) default(present)
        do k=1,kmax
           do j=2,j1
              do i=2,i1
@@ -398,6 +406,7 @@ subroutine closure
   call timer_toc("Boundary conditions") 
 
   call timer_tic("Write buffer", 3)
+  !$acc parallel loop collapse(2) default(present)
   do j=1,j2
     do i=1,i2
       ekm(i,j,k1)  = ekm(i,j,kmax)
@@ -442,6 +451,7 @@ subroutine sources
   real    tdef2, uwflux, vwflux, local_dudz, local_dvdz, local_dthvdz, horv
   integer i,j,k
 
+  !$acc parallel loop collapse(3) default(present) private(tdef2)
   do k=2,kmax
     do j=2,j1
       do i=2,i1
@@ -493,6 +503,7 @@ subroutine sources
 !     --------------------------------------------
 
   if (sgs_surface_fix) then
+    !$acc parallel loop collapse(2) default(present) private(tdef2,horv,uwflux,vwflux,local_dudz,local_dvdz,local_dthvdz)
     do j=2,j1
       do i=2,i1
         tdef2 = 2. * ( &
@@ -542,6 +553,7 @@ subroutine sources
       end do
     end do
   else
+    !$acc parallel loop collapse(2) default(present) private(tdef2)
     do j=2,j1
       do i=2,i1
         tdef2 = 2. * ( &
@@ -634,9 +646,11 @@ subroutine sources
 !    sbdiss(i,j,1) = - (ce1 + ce2*zlt(i,j,1)*deltai(1)) * e120(i,j,1)**2 /(2.*zlt(i,j,1))
 !  end do
 !  end do
-
+  
+  !$acc kernels default(present)
   e12p(2:i1,2:j1,1) = e12p(2:i1,2:j1,1) + &
             sbshr(2:i1,2:j1,1)+sbbuo(2:i1,2:j1,1)+sbdiss(2:i1,2:j1,1)  
+  !$acc end kernels
 
   return
   end subroutine sources
@@ -653,6 +667,7 @@ subroutine diffc (a_in,a_out,flux)
 
     integer i,j,k
     
+    !$acc parallel loop collapse(3) default(present)
     do k=2,kmax
       do j=2,j1
         do i=2,i1
@@ -674,7 +689,8 @@ subroutine diffc (a_in,a_out,flux)
         end do
       end do
     end do
-
+    
+    !$acc parallel loop collapse(2) default(present)
     do j=2,j1
       do i=2,i1
 
@@ -706,6 +722,8 @@ subroutine diffe(a_out)
 
     real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1)
     integer             :: i,j,k
+    
+    !$acc parallel loop collapse(3) default(present)
     do k=2,kmax
       do j=2,j1
         do i=2,i1
@@ -732,6 +750,7 @@ subroutine diffe(a_out)
   !     special treatment for lowest full level: k=1
   !     --------------------------------------------
 
+    !$acc parallel loop collapse(2)
     do j=2,j1
       do i=2,i1
 
@@ -764,6 +783,7 @@ subroutine diffu (a_out)
     real                :: ucu, upcu
     integer             :: i,j,k
 
+    !$acc parallel loop collapse(3) default(present) private(emom, emop, empo, emmo)
     do k=2,kmax
       do j=2,j1
         do i=2,i1
@@ -806,6 +826,7 @@ subroutine diffu (a_out)
   !     special treatment for lowest full level: k=1
   !     --------------------------------------------
 
+    !$acc parallel loop collapse(2) default(present) private(empo, emmo, emop, ucu, upcu, fu)
     do j=2,j1
       do i=2,i1
 
@@ -869,6 +890,7 @@ subroutine diffv (a_out)
     real                :: fv, vcv,vpcv
     integer             :: i,j,k
 
+    !$acc parallel loop collapse(3) default(present) private(eomm, eomp, emmo, epmo)
     do k=2,kmax
       do j=2,j1
         do i=2,i1
@@ -911,6 +933,7 @@ subroutine diffv (a_out)
   !     special treatment for lowest full level: k=1
   !     --------------------------------------------
 
+    !$acc parallel loop collapse(2) default(present) private(emmo, epmo, eomp, vcv, vpcv, fv)
     do j=2,j1
       do i=2,i1
 
@@ -971,6 +994,8 @@ subroutine diffw(a_out)
     real(field_r), intent(inout) :: a_out(2-ih:i1+ih,2-jh:j1+jh,k1)
     real                :: emom, eomm, eopm, epom
     integer             :: i,j,k
+    
+    !$acc parallel loop collapse(3) default(present) private(emom, eomm, eopm, epom)
     do k=2,kmax
       do j=2,j1
         do i=2,i1

From 4f68a356d28f0d03e16efb4c278912f176b03bed Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 14:04:59 +0200
Subject: [PATCH 15/19] Fix anis_fac index

---
 src/modsubgrid.f90 | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 3aad7b82a..23d130671 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -697,10 +697,10 @@ subroutine diffc (a_in,a_out,flux)
         a_out(i,j,1) = a_out(i,j,1) &
                   + 0.5 * ( &
                 ( (ekh(i+1,j,1)+ekh(i,j,1))*(a_in(i+1,j,1)-a_in(i,j,1)) &
-                  -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(k) &
+                  -(ekh(i,j,1)+ekh(i-1,j,1))*(a_in(i,j,1)-a_in(i-1,j,1)) )*dx2i * anis_fac(1) &
                   + &
                 ( (ekh(i,j+1,1)+ekh(i,j,1))*(a_in(i,j+1,1)-a_in(i,j,1)) &
-                  -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(k) &
+                  -(ekh(i,j,1)+ekh(i,j-1,1))*(a_in(i,j,1)-a_in(i,j-1,1)) )*dy2i * anis_fac(1) &
                   + &
                 ( rhobh(2)/rhobf(1) * (dzf(2)*ekh(i,j,1) + dzf(1)*ekh(i,j,2)) &
                   *  (a_in(i,j,2)-a_in(i,j,1)) / dzh(2)**2 &
@@ -756,10 +756,10 @@ subroutine diffe(a_out)
 
         a_out(i,j,1) = a_out(i,j,1) + &
             ( (ekm(i+1,j,1)+ekm(i,j,1))*(e120(i+1,j,1)-e120(i,j,1)) &
-              -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(k) &
+              -(ekm(i,j,1)+ekm(i-1,j,1))*(e120(i,j,1)-e120(i-1,j,1)) )*dx2i * anis_fac(1) &
             + &
             ( (ekm(i,j+1,1)+ekm(i,j,1))*(e120(i,j+1,1)-e120(i,j,1)) &
-              -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(k) &
+              -(ekm(i,j,1)+ekm(i,j-1,1))*(e120(i,j,1)-e120(i,j-1,1)) )*dy2i * anis_fac(1) &
             + &
               ( rhobh(2)/rhobf(1) * (dzf(2)*ekm(i,j,1) + dzf(1)*ekm(i,j,2)) &
               *  (e120(i,j,2)-e120(i,j,1)) / dzh(2)**2              )/dzf(1)
@@ -860,12 +860,12 @@ subroutine diffu (a_out)
         a_out(i,j,1) = a_out(i,j,1) &
                 + &
               ( ekm(i,j,1)  * (u0(i+1,j,1)-u0(i,j,1)) &
-              -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(k) &
+              -ekm(i-1,j,1)* (u0(i,j,1)-u0(i-1,j,1)) ) * 2. * dx2i * anis_fac(1) &
                 + &
               ( empo * ( (u0(i,j+1,1)-u0(i,j,1))   *dyi &
                         +(v0(i,j+1,1)-v0(i-1,j+1,1))*dxi) &
               -emmo * ( (u0(i,j,1)-u0(i,j-1,1))   *dyi &
-                        +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) / dy * anis_fac(k) &
+                        +(v0(i,j,1)-v0(i-1,j,1))  *dxi)   ) / dy * anis_fac(1) &
                + &
               ( rhobh(2)/rhobf(1) * emop * ( (u0(i,j,2)-u0(i,j,1))    /dzh(2) &
                         +(w0(i,j,2)-w0(i-1,j,2))  *dxi) &
@@ -967,10 +967,10 @@ subroutine diffv (a_out)
                   ( epmo * ( (v0(i+1,j,1)-v0(i,j,1))   *dxi &
                             +(u0(i+1,j,1)-u0(i+1,j-1,1))*dyi) &
                     -emmo * ( (v0(i,j,1)-v0(i-1,j,1))   *dxi &
-                            +(u0(i,j,1)-u0(i,j-1,1))    *dyi)   ) / dx * anis_fac(k) &
+                            +(u0(i,j,1)-u0(i,j-1,1))    *dyi)   ) / dx * anis_fac(1) &
                   + &
                 ( ekm(i,j,1) * (v0(i,j+1,1)-v0(i,j,1)) &
-                  -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1))  ) * 2. * dy2i * anis_fac(k) &
+                  -ekm(i,j-1,1)* (v0(i,j,1)-v0(i,j-1,1))  ) * 2. * dy2i * anis_fac(1) &
                   + &
                 ( rhobh(2)/rhobf(1) * eomp * ( (v0(i,j,2)-v0(i,j,1))     /dzh(2) &
                           +(w0(i,j,2)-w0(i,j-1,2))    *dyi) &

From be059b5f506a302150acbd4c58363af67d9b05d7 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 14:50:18 +0200
Subject: [PATCH 16/19] Add/update data clauses

---
 src/modadvection.f90 |  3 +--
 src/modsubgrid.f90   | 26 +++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/modadvection.f90 b/src/modadvection.f90
index 573247b7b..058f44c95 100644
--- a/src/modadvection.f90
+++ b/src/modadvection.f90
@@ -47,7 +47,7 @@ subroutine advection
 
   ! leq = .false. ! for testing that the non-uniform advection routines agree with the uniform ones
                   ! when the grid is uniform
-  !$acc data copyin(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh) copy(up, vp, wp, e12p, thlp, qtp)
+  !$acc enter data copyin(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh, up, vp, wp, e12p, thlp, qtp)
   select case(iadv_mom)
     case(iadv_cd2)
       call advecu_2nd(u0,up)
@@ -212,6 +212,5 @@ subroutine advection
     end select
   end do
   !$acc wait
-  !$acc end data
 end subroutine advection
 end module modadvection
diff --git a/src/modsubgrid.f90 b/src/modsubgrid.f90
index 23d130671..acc5b2fac 100644
--- a/src/modsubgrid.f90
+++ b/src/modsubgrid.f90
@@ -114,7 +114,8 @@ subroutine initsubgrid
       write (6,*) 'cs    = ',cs
       write (6,*) 'Rigc  = ',Rigc
     endif
-
+  
+  !$acc enter data create(anis_fac)
   end subroutine initsubgrid
 
   subroutine subgridnamelist
@@ -158,15 +159,24 @@ subroutine subgrid
  ! Diffusion subroutines
 ! Thijs Heus, Chiel van Heerwaarden, 15 June 2007
 
-    use modglobal, only : nsv, lmoist
-    use modfields, only : up,vp,wp,e12p,thl0,thlp,qt0,qtp,sv0,svp
-    use modsurfdata,only : thlflux,qtflux,svflux
+    use modglobal, only : nsv, lmoist, deltai, delta, dzf, dzh
+    use modfields, only : up,vp,wp,e12p,thl0,thlp,qt0,qtp,sv0,svp,u0,v0,w0,rhobh,rhobf,e120,dthvdz, thvf
+    use modsurfdata,only : thlflux,qtflux,svflux,ustar,dudz,dvdz
+
     implicit none
     integer n
+
+    ! Already on GPU: u0, v0, w0, 120, thl0, qt0, dzf, dzh, rhobf, rhobh,
+    ! up, vp, wp, e12p, thlp, qtp
     
+    !$acc enter data copyin(thvf, thlflux, dthvdz, qtflux, delta, deltai, ustar)
+    !$acc enter data copyin(dudz, dvdz)
+    !$acc enter data copyin(ekm, ekh, zlt, sbdiss, sbbuo, sbshr, csz) 
+
     call timer_tic("closure", 0)
     call closure
     call timer_toc("closure")
+    !$acc wait
     call diffu(up)
     call diffv(vp)
     call diffw(wp)
@@ -177,11 +187,17 @@ subroutine subgrid
       call diffc(sv0(:,:,:,n),svp(:,:,:,n),svflux(:,:,n))
     end do
     if (.not. lsmagorinsky) call sources
+
+    !$acc exit data copyout(up, vp, wp)
+    !$acc exit data copyout(e12p, thlp, qtp)
+    !$acc exit data copyout(ekm, ekh, zlt)
+    !$acc exit data delete(u0, v0, w0, e120, thl0, qt0, dzf, dzh, rhobf, rhobh)
+    !$acc exit data delete(csz)
   end subroutine
 
   subroutine exitsubgrid
     implicit none
-    deallocate(ekm,ekh,zlt,sbdiss,sbbuo,sbshr,csz)
+    deallocate(ekm,ekh,zlt,sbdiss,sbbuo,sbshr,csz,anis_fac)
   end subroutine exitsubgrid
 
    subroutine closure

From c270f403f7b3001f7300cccf6f5b5f7e91664b79 Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 15:26:03 +0200
Subject: [PATCH 17/19] Update exchange kernels

---
 src/modmpi.f90 | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/modmpi.f90 b/src/modmpi.f90
index fe6a1bc09..986cdf0a6 100644
--- a/src/modmpi.f90
+++ b/src/modmpi.f90
@@ -335,6 +335,9 @@ subroutine exitmpi
   end subroutine exitmpi
 
   subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
+  #if defined(_OPENACC)
+  use openacc
+  #endif
   implicit none
   integer sx, ex, sy, ey, sz, ez, ih, jh
   real(real32) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez)
@@ -354,7 +357,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   #endif
 
 ! Calulate buffer lengths
-  !$acc kernels if(is_present) async
+  !$acc kernels default(present) if(is_present)
   xl = size(a,1)
   yl = size(a,2)
   zl = size(a,3)
@@ -392,7 +395,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
-    !$acc kernels default(present)
+    !$acc kernels default(present) if(is_present)
     a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:)
     a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:)
     !$acc end kernels
@@ -426,7 +429,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
-    !$acc kernels default(present)
+    !$acc kernels default(present) if(is_present)
     a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:)
     a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:)
     !$acc end kernels
@@ -457,6 +460,9 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   end subroutine excjs_real32
 
   subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
+  #if defined(_OPENACC)
+  use openacc
+  #endif
   implicit none
   integer sx, ex, sy, ey, sz, ez, ih, jh
   real(real64) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez)
@@ -476,7 +482,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
   #endif
 
 ! Calulate buffer lengths
-  !$acc kernels default(present) if(is_present) async(1)
+  !$acc kernels default(present) if(is_present)
   xl = size(a,1)
   yl = size(a,2)
   zl = size(a,3)
@@ -515,7 +521,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
-    !$acc kernels default(present) if(is_present) async(1)
+    !$acc kernels default(present) if(is_present)
     a(:,sy-jh:sy-1,:) = a(:,ey-jh+1:ey,:)
     a(:,ey+1:ey+jh,:) = a(:,sy:sy+jh-1,:)
     !$acc end kernels
@@ -549,7 +555,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
   else
 
     ! Single processor, make sure the field is periodic
-    !$acc kernels default(present) if(is_present) async(1)
+    !$acc kernels default(present) if(is_present)
     a(sx-ih:sx-1,:,:) = a(ex-ih+1:ex,:,:)
     a(ex+1:ex+ih,:,:) = a(sx:sx+ih-1,:,:)
     !$acc end kernels

From a0652712566e48e3b351881ced46c4db78c684dc Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 15:30:59 +0200
Subject: [PATCH 18/19] Fix preprocessor stuff

---
 src/modmpi.f90 | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/modmpi.f90 b/src/modmpi.f90
index 986cdf0a6..74193329f 100644
--- a/src/modmpi.f90
+++ b/src/modmpi.f90
@@ -335,9 +335,9 @@ subroutine exitmpi
   end subroutine exitmpi
 
   subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
-  #if defined(_OPENACC)
+#if defined(_OPENACC)
   use openacc
-  #endif
+#endif
   implicit none
   integer sx, ex, sy, ey, sz, ez, ih, jh
   real(real32) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez)
@@ -351,10 +351,10 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sende,recve &
                                           , sendw,recvw
   ! Check if the data is on the gpu
-  #if defined(_OPENACC)
+#if defined(_OPENACC)
   integer :: is_present
   is_present = acc_is_present(a)
-  #endif
+#endif
 
 ! Calulate buffer lengths
   !$acc kernels default(present) if(is_present)
@@ -460,9 +460,9 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
   end subroutine excjs_real32
 
   subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
-  #if defined(_OPENACC)
+#if defined(_OPENACC)
   use openacc
-  #endif
+#endif
   implicit none
   integer sx, ex, sy, ey, sz, ez, ih, jh
   real(real64) a(sx-ih:ex+ih, sy-jh:ey+jh, sz:ez)
@@ -476,10 +476,10 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sende,recve &
                                           , sendw,recvw
   ! Check if the data is on the gpu
-  #if defined(_OPENACC)
+#if defined(_OPENACC)
   integer :: is_present
   is_present = acc_is_present(a)
-  #endif
+#endif
 
 ! Calulate buffer lengths
   !$acc kernels default(present) if(is_present)

From d41510a1eb4d6d0eebd86bac4c6608ea6c07a6de Mon Sep 17 00:00:00 2001
From: Caspar Jungbacker <caspar.jungbacker@outlook.com>
Date: Tue, 13 Jun 2023 15:36:47 +0200
Subject: [PATCH 19/19] Change data type

---
 src/modmpi.f90 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/modmpi.f90 b/src/modmpi.f90
index 74193329f..c4347a180 100644
--- a/src/modmpi.f90
+++ b/src/modmpi.f90
@@ -352,7 +352,7 @@ subroutine excjs_real32(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sendw,recvw
   ! Check if the data is on the gpu
 #if defined(_OPENACC)
-  integer :: is_present
+  logical :: is_present
   is_present = acc_is_present(a)
 #endif
 
@@ -477,7 +477,7 @@ subroutine excjs_real64(a,sx,ex,sy,ey,sz,ez,ih,jh)
                                           , sendw,recvw
   ! Check if the data is on the gpu
 #if defined(_OPENACC)
-  integer :: is_present
+  logical :: is_present
   is_present = acc_is_present(a)
 #endif