From 279e4c43d7039487be4cd268489c865b2d56a66f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 16:39:36 +0100
Subject: [PATCH 01/17] support for MappedFunctions

---
 kernel_maps.jl                                |  0
 ...ute_hydrostatic_free_surface_tendencies.jl | 29 ++-----
 src/Utils/kernel_launching.jl                 | 87 ++++++++++++++++---
 3 files changed, 81 insertions(+), 35 deletions(-)
 create mode 100644 kernel_maps.jl

diff --git a/kernel_maps.jl b/kernel_maps.jl
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
index 82c71b05dd..e61ef7dfa5 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
@@ -96,7 +96,6 @@ function compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_
                 compute_hydrostatic_free_surface_Gc!,
                 c_tendency,
                 grid,
-                active_cells_map,
                 args;
                 active_cells_map)
     end
@@ -161,12 +160,12 @@ function compute_hydrostatic_momentum_tendencies!(model, velocities, kernel_para
 
     launch!(arch, grid, kernel_parameters,
             compute_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, 
-            active_cells_map, u_kernel_args;
+            u_kernel_args;
             active_cells_map)
 
     launch!(arch, grid, kernel_parameters,
             compute_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, 
-            active_cells_map, v_kernel_args;
+            v_kernel_args;
             active_cells_map)
 
     compute_free_surface_tendency!(grid, model, :xy)
@@ -200,45 +199,27 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, ::Nothing, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, active_cells_map, args)
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, active_cells_map)
-    @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
-end
-
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, ::Nothing, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, active_cells_map, args)
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, active_cells_map)
-    @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
-end
-
 #####
 ##### Tendency calculators for tracers
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, ::Nothing, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, active_cells_map, args)
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, active_cells_map)
-    @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
-end
-
 #####
 ##### Tendency calculators for an explicit free surface
 #####
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 3625ae7632..6231e6be23 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -7,6 +7,7 @@ using Oceananigans.Architectures
 using Oceananigans.Grids
 using Oceananigans.Grids: AbstractGrid
 using Base: @pure
+using KernelAbstractions: Kernel
 
 import Oceananigans
 import KernelAbstractions: get, expand
@@ -80,6 +81,18 @@ end
 contiguousrange(range::NTuple{N, Int}, offset::NTuple{N, Int}) where N = Tuple(1+o:r+o for (r, o) in zip(range, offset))
 flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] for d = 1:3)
 
+#### 
+#### Internal utility to launch a function mapped on an index_map
+####
+
+struct MappedFunction{M} <: Function
+    f::Function
+    index_map::M
+end
+
+@inline (m::MappedFunction)(_ctx_)          = m.f(_ctx_)
+@inline (m::MappedFunction)(_ctx_, args...) = m.f(_ctx_, args...)
+
 # Support for 1D
 heuristic_workgroup(Wx) = min(Wx, 256)
 
@@ -238,9 +251,20 @@ the architecture `arch`.
 
     dev = Architectures.device(arch)
     loop = kernel!(dev, workgroup, worksize)
+
+    # Map out the function to use active_cells_map
+    # as an index map
+    if !isnothing(active_cells_map)
+        func  = MappedFunction(loop.f, active_cells_map)
+        param = get_kernel_parameters(loop)
+        M     = typeof(func)
+        loop  = Kernel{param..., M}(dev, func)
+    end
+
     return loop, worksize
 end
 
+@inline get_kernel_parameters(k::Kernel{A, B, C}) where {A, B, C} = A, B, C
        
 """
     launch!(arch, grid, workspec, kernel!, kernel_args...; kw...)
@@ -272,10 +296,7 @@ end
 @inline function _launch!(arch, grid, workspec, kernel!, first_kernel_arg, other_kernel_args...;
                           exclude_periphery = false,
                           reduced_dimensions = (),
-                          active_cells_map = nothing,
-                          # TODO: these two kwargs do nothing:
-                          only_local_halos = false,
-                          async = false)
+                          active_cells_map = nothing)
 
     location = Oceananigans.location(first_kernel_arg)
 
@@ -320,6 +341,13 @@ using KernelAbstractions: Kernel
 using KernelAbstractions.NDIteration: _Size, StaticSize
 using KernelAbstractions.NDIteration: NDRange
 
+using KernelAbstractions.NDIteration
+using KernelAbstractions: ndrange, workgroupsize
+import KernelAbstractions: partition
+
+using KernelAbstractions: CompilerMetadata
+import KernelAbstractions: __ndrange, __groupsize
+
 struct OffsetStaticSize{S} <: _Size
     function OffsetStaticSize{S}() where S
         new{S::Tuple{Vararg}}()
@@ -369,13 +397,6 @@ const OffsetNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:KernelO
     return CartesianIndex(nI)
 end
 
-using KernelAbstractions.NDIteration
-using KernelAbstractions: ndrange, workgroupsize
-import KernelAbstractions: partition
-
-using KernelAbstractions: CompilerMetadata
-import KernelAbstractions: __ndrange, __groupsize
-
 @inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange<:OffsetStaticSize}  = CartesianIndices(get(NDRange))
 @inline __groupsize(cm::CompilerMetadata{NDRange}) where {NDRange<:OffsetStaticSize} = size(__ndrange(cm))
 
@@ -413,3 +434,47 @@ function partition(kernel::OffsetKernel, inrange, ingroupsize)
     return iterspace, dynamic
 end
 
+#####
+##### Utilities for Mapped kernels
+#####
+
+struct IndexMap{M}
+    index_map :: M
+end
+
+const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N
+
+# NDRange has been modified to have offsets in place of workitems: Remember, dynamic offset kernels are not possible with this extension!!
+# TODO: maybe don't do this
+@inline function expand(ndrange::MappedNDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
+    nI = ntuple(Val(N)) do I
+        Base.@_inline_meta
+        offsets = workitems(ndrange)
+        stride = size(offsets, I)
+        gidx = groupidx.I[I]
+        @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]]
+    end
+    return CartesianIndex(nI...)
+end
+
+const MappedKernel = Kernel{<:Any, <:Any, <:Any, <:MappedFunction}
+
+# Extending the partition function to include offsets in NDRange: note that in this case the 
+# offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels
+function partition(kernel::MappedKernel, inrange, ingroupsize)
+    static_workgroupsize = workgroupsize(kernel)
+    
+    # Calculate the static NDRange and WorkgroupSize
+    index_map = kernel.f.index_map
+    range = length(index_map)
+    groupsize = get(static_workgroupsize)
+
+    blocks, groupsize, dynamic = NDIteration.partition(range, groupsize)
+
+    static_blocks = StaticSize{blocks}
+    static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize
+    
+    iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map))
+
+    return iterspace, dynamic
+end

From 6182e346a056739533e7ac7727a8cf704a48c6db Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 16:49:01 +0100
Subject: [PATCH 02/17] remove double kernels everywhere

---
 .../hydrostatic_free_surface_ab2_step.jl      |  1 -
 .../split_explicit_free_surface_kernels.jl    | 21 +---------
 ...ore_hydrostatic_free_surface_tendencies.jl |  1 -
 .../compute_nonhydrostatic_tendencies.jl      | 39 ++++---------------
 4 files changed, 10 insertions(+), 52 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
index c76ac3b525..9cbb8e271f 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
@@ -1,7 +1,6 @@
 using Oceananigans.Fields: location
 using Oceananigans.TimeSteppers: ab2_step_field!
 using Oceananigans.TurbulenceClosures: implicit_step!
-using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map, retrieve_surface_active_cells_map
 
 import Oceananigans.TimeSteppers: ab2_step!
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 0042755123..786f99e129 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -133,7 +133,7 @@ end
 
 # Barotropic Model Kernels
 # u_Δz = u * Δz
-@kernel function _barotropic_mode_kernel!(U, V, grid, ::Nothing, u, v)
+@kernel function _barotropic_mode_kernel!(U, V, grid, u, v)
     i, j  = @index(Global, NTuple)
     k_top = grid.Nz+1
 
@@ -146,26 +146,9 @@ end
     end
 end
 
-# Barotropic Model Kernels
-# u_Δz = u * Δz
-@kernel function _barotropic_mode_kernel!(U, V, grid, active_cells_map, u, v)
-    idx = @index(Global, Linear)
-    i, j = active_linear_index_to_tuple(idx, active_cells_map)
-    k_top = grid.Nz+1
-
-    @inbounds U[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * u[i, j, 1]
-    @inbounds V[i, j, k_top-1] = Δzᶜᶠᶜ(i, j, 1, grid) * v[i, j, 1]
-
-    for k in 2:grid.Nz
-        @inbounds U[i, j, k_top-1] += Δzᶠᶜᶜ(i, j, k, grid) * u[i, j, k]
-        @inbounds V[i, j, k_top-1] += Δzᶜᶠᶜ(i, j, k, grid) * v[i, j, k]
-    end
-end
-
 @inline function compute_barotropic_mode!(U, V, grid, u, v)
     active_cells_map = retrieve_surface_active_cells_map(grid)
-
-    launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, active_cells_map, u, v; active_cells_map)
+    launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, u, v; active_cells_map)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
index a84d03e13f..089370fdd9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
@@ -4,7 +4,6 @@ using Oceananigans.TimeSteppers: store_field_tendencies!
 
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
-using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map
 
 using Oceananigans.Utils: launch!
 
diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
index ffa7865bb2..67bcbbbb98 100644
--- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
@@ -103,15 +103,15 @@ function compute_interior_tendency_contributions!(model, kernel_parameters; acti
 
     exclude_periphery = true
     launch!(arch, grid, kernel_parameters, compute_Gu!, 
-            tendencies.u, grid, active_cells_map, u_kernel_args;
+            tendencies.u, grid, u_kernel_args;
             active_cells_map, exclude_periphery)
 
     launch!(arch, grid, kernel_parameters, compute_Gv!, 
-            tendencies.v, grid, active_cells_map, v_kernel_args;
+            tendencies.v, grid, v_kernel_args;
             active_cells_map, exclude_periphery)
 
     launch!(arch, grid, kernel_parameters, compute_Gw!, 
-            tendencies.w, grid, active_cells_map, w_kernel_args;
+            tendencies.w, grid, w_kernel_args;
             active_cells_map, exclude_periphery)
 
     start_tracer_kernel_args = (advection, closure)
@@ -131,7 +131,7 @@ function compute_interior_tendency_contributions!(model, kernel_parameters; acti
                      forcing, clock)
 
         launch!(arch, grid, kernel_parameters, compute_Gc!, 
-                c_tendency, grid, active_cells_map, args;
+                c_tendency, grid, args;
                 active_cells_map)
     end
 
@@ -143,57 +143,34 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function compute_Gu!(Gu, grid, ::Nothing, args) 
+@kernel function compute_Gu!(Gu, grid, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gu!(Gu, grid, interior_map, args) 
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, interior_map)
-    @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
-end
-
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function compute_Gv!(Gv, grid, ::Nothing, args) 
+@kernel function compute_Gv!(Gv, grid, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gv!(Gv, grid, interior_map, args) 
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, interior_map)
-    @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
-end
-
 """ Calculate the right-hand-side of the w-velocity equation. """
-@kernel function compute_Gw!(Gw, grid, ::Nothing, args) 
+@kernel function compute_Gw!(Gw, grid, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gw!(Gw, grid, interior_map, args)
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, interior_map)
-    @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
-end
 
 #####
 ##### Tracer(s)
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function compute_Gc!(Gc, grid, ::Nothing, args)
+@kernel function compute_Gc!(Gc, grid, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function compute_Gc!(Gc, grid, interior_map, args) 
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_tuple(idx, interior_map)
-    @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
-end
-
 #####
 ##### Boundary contributions to tendencies due to user-prescribed fluxes
 #####

From 5545605ae1631221fb88d7530e06bde8df6bc2f2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 16:55:18 +0100
Subject: [PATCH 03/17] make sure also GPU works

---
 src/Utils/kernel_launching.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 6231e6be23..c81a31369f 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -6,6 +6,7 @@ using Oceananigans: location
 using Oceananigans.Architectures
 using Oceananigans.Grids
 using Oceananigans.Grids: AbstractGrid
+using Adapt
 using Base: @pure
 using KernelAbstractions: Kernel
 
@@ -90,6 +91,9 @@ struct MappedFunction{M} <: Function
     index_map::M
 end
 
+Adapt.adapt_structure(to, m::MappedFunction) = 
+    MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.index_map))
+
 @inline (m::MappedFunction)(_ctx_)          = m.f(_ctx_)
 @inline (m::MappedFunction)(_ctx_, args...) = m.f(_ctx_, args...)
 

From e239dbed09bdf899e7bda469cafab9d3e0822154 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 16:59:30 +0100
Subject: [PATCH 04/17] make sure there is no return in the function

---
 src/Utils/kernel_launching.jl | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index c81a31369f..7b1409b392 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -94,8 +94,13 @@ end
 Adapt.adapt_structure(to, m::MappedFunction) = 
     MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.index_map))
 
-@inline (m::MappedFunction)(_ctx_)          = m.f(_ctx_)
-@inline (m::MappedFunction)(_ctx_, args...) = m.f(_ctx_, args...)
+@inline function (m::MappedFunction)(_ctx_)  
+    m.f(_ctx_)
+end
+
+@inline function (m::MappedFunction)(_ctx_, args...) 
+    m.f(_ctx_, args...)
+end
 
 # Support for 1D
 heuristic_workgroup(Wx) = min(Wx, 256)

From 32e0c524eb617a75da7ff1841db4a3c0cdfa1f17 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 17:00:15 +0100
Subject: [PATCH 05/17] return nothing

---
 src/Utils/kernel_launching.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 7b1409b392..9084639507 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -96,10 +96,12 @@ Adapt.adapt_structure(to, m::MappedFunction) =
 
 @inline function (m::MappedFunction)(_ctx_)  
     m.f(_ctx_)
+    return nothing
 end
 
 @inline function (m::MappedFunction)(_ctx_, args...) 
     m.f(_ctx_, args...)
+    return nothing
 end
 
 # Support for 1D

From 0b56304b347c5961b920aaa50b50600b4bfd69ba Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 17:26:44 +0100
Subject: [PATCH 06/17] adapt for GPU usage

---
 src/BoundaryConditions/fill_halo_regions.jl |  6 ++++-
 src/Utils/kernel_launching.jl               | 28 ++++++++++++---------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index fa380e7029..50ed69ef46 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -46,7 +46,11 @@ const MaybeTupledData = Union{OffsetArray, NTuple{<:Any, OffsetArray}}
 
 "Fill halo regions in ``x``, ``y``, and ``z`` for a given field's data."
 function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, loc, grid, args...; 
-                            fill_boundary_normal_velocities = true, kwargs...)
+                            fill_boundary_normal_velocities = true, 
+                            only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
+                            async = false,  # Only valid for `DistributedGrids`, we throw it away here
+                            kwargs...)
+                            
     arch = architecture(grid)
 
     if fill_boundary_normal_velocities
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 9084639507..9ca6fb5371 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -86,8 +86,8 @@ flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d]
 #### Internal utility to launch a function mapped on an index_map
 ####
 
-struct MappedFunction{M} <: Function
-    f::Function
+struct MappedFunction{F, M} <: Function
+    f::F
     index_map::M
 end
 
@@ -453,19 +453,21 @@ struct IndexMap{M}
     index_map :: M
 end
 
+Adapt.adapt_structure(to, m::IndexMap) = IndexMap(Adapt.adapt(to, m.index_map))
+
 const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N
 
-# NDRange has been modified to have offsets in place of workitems: Remember, dynamic offset kernels are not possible with this extension!!
+# NDRange has been modified to include an index_map in place of workitems: R
+# Remember, dynamic offset kernels are not possible with this extension!!
+# Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel.
 # TODO: maybe don't do this
-@inline function expand(ndrange::MappedNDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
-    nI = ntuple(Val(N)) do I
-        Base.@_inline_meta
-        offsets = workitems(ndrange)
-        stride = size(offsets, I)
-        gidx = groupidx.I[I]
-        @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]]
-    end
-    return CartesianIndex(nI...)
+@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex{N}, idx::CartesianIndex{N})
+    Base.@_inline_meta
+    offsets = workitems(ndrange)
+    stride = size(offsets, I)
+    gidx = groupidx.I[I]
+    @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]]
+    return CartesianIndex(nI)
 end
 
 const MappedKernel = Kernel{<:Any, <:Any, <:Any, <:MappedFunction}
@@ -478,6 +480,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     # Calculate the static NDRange and WorkgroupSize
     index_map = kernel.f.index_map
     range = length(index_map)
+    arch  = Oceananigans.Architectures.architecture(index_map)
     groupsize = get(static_workgroupsize)
 
     blocks, groupsize, dynamic = NDIteration.partition(range, groupsize)
@@ -485,6 +488,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     static_blocks = StaticSize{blocks}
     static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize
     
+    index_map = Oceananigans.Architectures.convert_args(arch, index_map)
     iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map))
 
     return iterspace, dynamic

From 052f56e5c272de60c9cbac2e94833541c88df320 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 17:43:28 +0100
Subject: [PATCH 07/17] not sure about the GPU

---
 src/Utils/kernel_launching.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 9ca6fb5371..41e4770dee 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -461,12 +461,12 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa
 # Remember, dynamic offset kernels are not possible with this extension!!
 # Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel.
 # TODO: maybe don't do this
-@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex{N}, idx::CartesianIndex{N})
+@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex, idx::CartesianIndex) 
     Base.@_inline_meta
     offsets = workitems(ndrange)
-    stride = size(offsets, I)
-    gidx = groupidx.I[I]
-    @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]]
+    stride = size(offsets, 1)
+    gidx = groupidx.I[1]
+    @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]]
     return CartesianIndex(nI)
 end
 

From d0d56ac02dc785a81b01d2dd0bcf605f625f78f7 Mon Sep 17 00:00:00 2001
From: simone-silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 12:47:31 -0500
Subject: [PATCH 08/17] hmmm

---
 src/Utils/kernel_launching.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 41e4770dee..667f6ab1ea 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -466,7 +466,7 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa
     offsets = workitems(ndrange)
     stride = size(offsets, 1)
     gidx = groupidx.I[1]
-    @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]]
+    nI = @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]]
     return CartesianIndex(nI)
 end
 

From 7e41737b458c00e8767f31caba685d683a4c6fe0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 19:56:08 +0100
Subject: [PATCH 09/17] probably like this it will work on the GPU?

---
 src/Utils/kernel_launching.jl | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 41e4770dee..65025f0b37 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -88,11 +88,11 @@ flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d]
 
 struct MappedFunction{F, M} <: Function
     f::F
-    index_map::M
+    imap::M
 end
 
 Adapt.adapt_structure(to, m::MappedFunction) = 
-    MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.index_map))
+    MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.imap))
 
 @inline function (m::MappedFunction)(_ctx_)  
     m.f(_ctx_)
@@ -466,11 +466,18 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa
     offsets = workitems(ndrange)
     stride = size(offsets, 1)
     gidx = groupidx.I[1]
-    @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]]
+    tI = (gidx - 1) * stride + idx.I[1]
+    nI = ndrange.workitems.index_map[tI]
     return CartesianIndex(nI)
 end
 
-const MappedKernel = Kernel{<:Any, <:Any, <:Any, <:MappedFunction}
+const MappedKernel{D} = Kernel{D, <:Any, <:Any, <:MappedFunction} where D
+
+# Override the getproperty to make sure we get the correct properties
+@inline getproperty(k::MappedKernel, prop::Symbol) = get_mapped_property(k, Val(prop))
+
+@inline get_mapped_property(k, ::Val{:imap}) = k.f.imap
+@inline get_mapped_property(k, ::Val{:f}) = k.f.f
 
 # Extending the partition function to include offsets in NDRange: note that in this case the 
 # offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels
@@ -478,7 +485,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     static_workgroupsize = workgroupsize(kernel)
     
     # Calculate the static NDRange and WorkgroupSize
-    index_map = kernel.f.index_map
+    index_map = getproperty(kernel, :imap)
     range = length(index_map)
     arch  = Oceananigans.Architectures.architecture(index_map)
     groupsize = get(static_workgroupsize)
@@ -492,4 +499,4 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map))
 
     return iterspace, dynamic
-end
+end
\ No newline at end of file

From 130a3571732bbe3d28de5dc388591ec71352efe8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 21:35:57 +0100
Subject: [PATCH 10/17] some cleanup plus testing a test?

---
 src/Utils/kernel_launching.jl | 39 ++++++++++++++---------------------
 test.jl                       | 18 ++++++++++++++++
 2 files changed, 33 insertions(+), 24 deletions(-)
 create mode 100644 test.jl

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index b158fe09a4..436cde1037 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -87,20 +87,15 @@ flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d]
 ####
 
 struct MappedFunction{F, M} <: Function
-    f::F
-    imap::M
+    func::F
+    index_map::M
 end
 
 Adapt.adapt_structure(to, m::MappedFunction) = 
-    MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.imap))
-
-@inline function (m::MappedFunction)(_ctx_)  
-    m.f(_ctx_)
-    return nothing
-end
+    MappedFunction(Adapt.adapt(to, m.func), Adapt.adapt(to, m.index_map))
 
 @inline function (m::MappedFunction)(_ctx_, args...) 
-    m.f(_ctx_, args...)
+    m.func(_ctx_, args...)
     return nothing
 end
 
@@ -251,6 +246,7 @@ the architecture `arch`.
                                   location = nothing,
                                   active_cells_map = nothing)
 
+
     if !isnothing(active_cells_map) # everything else is irrelevant
         workgroup = min(length(active_cells_map), 256)
         worksize = length(active_cells_map)
@@ -263,19 +259,16 @@ the architecture `arch`.
     dev = Architectures.device(arch)
     loop = kernel!(dev, workgroup, worksize)
 
-    # Map out the function to use active_cells_map
-    # as an index map
+    # Map out the function to use active_cells_map as an index map
     if !isnothing(active_cells_map)
-        func  = MappedFunction(loop.f, active_cells_map)
-        param = get_kernel_parameters(loop)
-        M     = typeof(func)
-        loop  = Kernel{param..., M}(dev, func)
+        func = MappedFunction(loop.f, active_cells_map)
+        loop = Kernel{get_kernel_parameters(loop)..., typeof(func)}(dev, func)
     end
 
     return loop, worksize
 end
 
-@inline get_kernel_parameters(k::Kernel{A, B, C}) where {A, B, C} = A, B, C
+@inline get_kernel_parameters(::Kernel{Dev, B, W}) where {Dev, B, W} = Dev, B, W
        
 """
     launch!(arch, grid, workspec, kernel!, kernel_args...; kw...)
@@ -347,8 +340,6 @@ end
 #####
 
 # TODO: when offsets are implemented in KA so that we can call `kernel(dev, group, size, offsets)`, remove all of this
-
-using KernelAbstractions: Kernel
 using KernelAbstractions.NDIteration: _Size, StaticSize
 using KernelAbstractions.NDIteration: NDRange
 
@@ -450,14 +441,14 @@ end
 #####
 
 struct IndexMap{M}
-    imap :: M
+    index_map :: M
 end
 
 Adapt.adapt_structure(to, m::IndexMap) = IndexMap(Adapt.adapt(to, m.index_map))
 
 const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N
 
-# NDRange has been modified to include an index_map in place of workitems: R
+# NDRange has been modified to include an index_map in place of workitems.
 # Remember, dynamic offset kernels are not possible with this extension!!
 # Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel.
 # TODO: maybe don't do this
@@ -467,7 +458,7 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa
     stride = size(offsets, 1)
     gidx = groupidx.I[1]
     tI = (gidx - 1) * stride + idx.I[1]
-    nI = ndrange.workitems.imap[tI]
+    nI = ndrange.workitems.index_map[tI]
     return CartesianIndex(nI)
 end
 
@@ -476,8 +467,8 @@ const MappedKernel{D} = Kernel{D, <:Any, <:Any, <:MappedFunction} where D
 # Override the getproperty to make sure we get the correct properties
 @inline getproperty(k::MappedKernel, prop::Symbol) = get_mapped_property(k, Val(prop))
 
-@inline get_mapped_property(k, ::Val{:imap}) = k.f.imap
-@inline get_mapped_property(k, ::Val{:f}) = k.f.f
+@inline get_mapped_property(k, ::Val{:index_map}) = k.f.index_map
+@inline get_mapped_property(k, ::Val{:func}) = k.f.func
 
 # Extending the partition function to include offsets in NDRange: note that in this case the 
 # offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels
@@ -485,7 +476,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     static_workgroupsize = workgroupsize(kernel)
     
     # Calculate the static NDRange and WorkgroupSize
-    index_map = getproperty(kernel, :imap)
+    index_map = getproperty(kernel, :index_map)
     range = length(index_map)
     arch  = Oceananigans.Architectures.architecture(index_map)
     groupsize = get(static_workgroupsize)
diff --git a/test.jl b/test.jl
new file mode 100644
index 0000000000..70239fa292
--- /dev/null
+++ b/test.jl
@@ -0,0 +1,18 @@
+using Revise
+using Oceananigans
+using Oceananigans.Utils
+
+using KernelAbstractions: @index, @kernel
+
+arch = CPU()
+
+@kernel function _test_indices(a)
+    i, j, k = @index(Global, NTuple)
+    a[i, j, k] = i + j + k
+end
+
+grid  = RectilinearGrid(arch, size = (3, 3, 3), extent = (1, 1, 1))
+array = zeros(arch, 3, 3, 3)
+imap  = on_architecture(arch, [(i, j, k) for i in 1:3, j in 1:3, k in 1:2])
+
+launch!(arch, grid_cpu, :xyz, _test_indices, array; active_cells_map = imap)
\ No newline at end of file

From 3be13dd0860740de74fa91b5df2532a623df1ed9 Mon Sep 17 00:00:00 2001
From: simone-silvestri <silvestri.simone0@gmail.com>
Date: Tue, 12 Nov 2024 18:40:50 -0500
Subject: [PATCH 11/17] works on gpu, to fix the dynamic check

---
 src/Utils/kernel_launching.jl | 91 +++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 35 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 436cde1037..02473f0d5b 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -12,6 +12,7 @@ using KernelAbstractions: Kernel
 
 import Oceananigans
 import KernelAbstractions: get, expand
+import Base
 
 struct KernelParameters{S, O} end
 
@@ -82,23 +83,12 @@ end
 contiguousrange(range::NTuple{N, Int}, offset::NTuple{N, Int}) where N = Tuple(1+o:r+o for (r, o) in zip(range, offset))
 flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] for d = 1:3)
 
-#### 
-#### Internal utility to launch a function mapped on an index_map
-####
-
+# Internal utility to launch a function mapped on an index_map
 struct MappedFunction{F, M} <: Function
     func::F
     index_map::M
 end
 
-Adapt.adapt_structure(to, m::MappedFunction) = 
-    MappedFunction(Adapt.adapt(to, m.func), Adapt.adapt(to, m.index_map))
-
-@inline function (m::MappedFunction)(_ctx_, args...) 
-    m.func(_ctx_, args...)
-    return nothing
-end
-
 # Support for 1D
 heuristic_workgroup(Wx) = min(Wx, 256)
 
@@ -440,35 +430,41 @@ end
 ##### Utilities for Mapped kernels
 #####
 
-struct IndexMap{M}
-    index_map :: M
-end
-
-Adapt.adapt_structure(to, m::IndexMap) = IndexMap(Adapt.adapt(to, m.index_map))
+struct IndexMap end
 
-const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N
+const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:IndexMap, <:AbstractArray} where N
 
 # NDRange has been modified to include an index_map in place of workitems.
 # Remember, dynamic offset kernels are not possible with this extension!!
 # Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel.
 # TODO: maybe don't do this
-@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex, idx::CartesianIndex) 
-    Base.@_inline_meta
-    offsets = workitems(ndrange)
-    stride = size(offsets, 1)
-    gidx = groupidx.I[1]
-    tI = (gidx - 1) * stride + idx.I[1]
-    nI = ndrange.workitems.index_map[tI]
-    return CartesianIndex(nI)
+@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where N
+    nI = ntuple(Val(N)) do I
+        Base.@_inline_meta
+        offsets = workitems(ndrange)
+        stride = size(offsets, I)
+        gidx = groupidx.I[I]
+        ndrange.workitems[(gidx - 1) * stride + idx.I[I]]
+    end
+    return CartesianIndex(nI...)
 end
 
 const MappedKernel{D} = Kernel{D, <:Any, <:Any, <:MappedFunction} where D
 
-# Override the getproperty to make sure we get the correct properties
-@inline getproperty(k::MappedKernel, prop::Symbol) = get_mapped_property(k, Val(prop))
+# Override the getproperty to make sure we launch the correct function in the kernel
+@inline Base.getproperty(k::MappedKernel, prop::Symbol) = get_mapped_kernel_property(k, Val(prop))
+
+@inline get_mapped_kernel_property(k, ::Val{prop}) where prop = getfield(k, prop)
+@inline get_mapped_kernel_property(k, ::Val{:index_map}) = getfield(getfield(k, :f), :index_map)
+@inline get_mapped_kernel_property(k, ::Val{:f})         = getfield(getfield(k, :f), :func)
 
-@inline get_mapped_property(k, ::Val{:index_map}) = k.f.index_map
-@inline get_mapped_property(k, ::Val{:func}) = k.f.func
+Adapt.adapt_structure(to, cm::CompilerMetadata{N, C}) where {N, C} = 
+    CompilerMetadata{N, C}(Adapt.adapt(to, cm.groupindex), 
+                           Adapt.adapt(to, cm.ndrange),
+                           Adapt.adapt(to, cm.iterspace))
+
+Adapt.adapt_structure(to, ndrange::NDRange{N, B, W}) where {N, B, W} = 
+    NDRange{N, B, W}(Adapt.adapt(to, ndrange.blocks), Adapt.adapt(to, ndrange.workitems))
 
 # Extending the partition function to include offsets in NDRange: note that in this case the 
 # offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels
@@ -476,9 +472,8 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     static_workgroupsize = workgroupsize(kernel)
     
     # Calculate the static NDRange and WorkgroupSize
-    index_map = getproperty(kernel, :index_map)
+    index_map = kernel.index_map
     range = length(index_map)
-    arch  = Oceananigans.Architectures.architecture(index_map)
     groupsize = get(static_workgroupsize)
 
     blocks, groupsize, dynamic = NDIteration.partition(range, groupsize)
@@ -486,8 +481,34 @@ function partition(kernel::MappedKernel, inrange, ingroupsize)
     static_blocks = StaticSize{blocks}
     static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize
     
-    index_map = Oceananigans.Architectures.convert_args(arch, index_map)
-    iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map))
+    iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(IndexMap(), index_map)
 
     return iterspace, dynamic
-end
\ No newline at end of file
+end
+
+using KernelAbstractions: CompilerMetadata, NoDynamicCheck
+using CUDA: CUDABackend
+
+import KernelAbstractions: mkcontext, __dynamic_checkbounds, __validindex
+
+# Very dangerous override of mkcontext which will not work if we are not 
+# carefull with making sure that indices are correct when launching a `MappedKernel`
+# TODO: Definitely change this with options below
+function mkcontext(kernel::Kernel{CUDABackend}, _ndrange, iterspace::MappedNDRange)
+    return CompilerMetadata{ndrange(kernel), NoDynamicCheck}(_ndrange, iterspace)
+end
+
+# Alternative to the above to fix:
+# const MappedCompilerMetadata = CompilerMetadata{<:Any, <:Any, <:Any, <:Any, <:MappedNDRange}
+
+# @inline __ndrange(cm::MappedCompilerMetadata) = cm.iterspace
+
+# @inline function __validindex(ctx::MappedCompilerMetadata, idx::CartesianIndex)
+#     # Turns this into a noop for code where we can turn of checkbounds of
+#     if __dynamic_checkbounds(ctx)
+#         I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
+#         return I in __ndrange(ctx)
+#     else
+#         return true
+#     end
+# end
\ No newline at end of file

From 64cad183deccf3fd500422ec7868c5551904eb24 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 13 Nov 2024 08:36:26 +0100
Subject: [PATCH 12/17] move lwargs

---
 src/BoundaryConditions/fill_halo_regions.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index 50ed69ef46..c36c9c3fbf 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -47,8 +47,6 @@ const MaybeTupledData = Union{OffsetArray, NTuple{<:Any, OffsetArray}}
 "Fill halo regions in ``x``, ``y``, and ``z`` for a given field's data."
 function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, loc, grid, args...; 
                             fill_boundary_normal_velocities = true, 
-                            only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
-                            async = false,  # Only valid for `DistributedGrids`, we throw it away here
                             kwargs...)
                             
     arch = architecture(grid)
@@ -68,7 +66,10 @@ function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, lo
     return nothing
 end
 
-function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid, args...; kwargs...)
+function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid, args...; 
+                          only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
+                          async = false,  # Only valid for `DistributedGrids`, we throw it away here
+                          kwargs...)
 
     # Calculate size and offset of the fill_halo kernel
     # We assume that the kernel size is the same for west and east boundaries, 

From 333bfd707056483e19f99c4fca1ea28a8a78f9e2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 13 Nov 2024 09:12:08 +0100
Subject: [PATCH 13/17] kwarg management

---
 src/BoundaryConditions/fill_halo_regions.jl | 28 +++++++++++++--------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index c36c9c3fbf..686f91fd6f 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -67,7 +67,6 @@ function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, lo
 end
 
 function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid, args...; 
-                          only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
                           async = false,  # Only valid for `DistributedGrids`, we throw it away here
                           kwargs...)
 
@@ -300,27 +299,27 @@ end
 ##### Kernel launchers for single-sided fill_halos
 #####
 
-fill_west_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = 
+fill_west_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = 
     launch!(arch, grid, KernelParameters(size, offset),
             _fill_only_west_halo!, c, bc, loc, grid, Tuple(args); kwargs...)
 
-fill_east_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = 
+fill_east_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = 
     launch!(arch, grid, KernelParameters(size, offset),
             _fill_only_east_halo!, c, bc, loc, grid, Tuple(args); kwargs...)
 
-fill_south_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = 
+fill_south_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = 
     launch!(arch, grid, KernelParameters(size, offset),
             _fill_only_south_halo!, c, bc, loc, grid, Tuple(args); kwargs...)
 
-fill_north_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = 
+fill_north_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = 
     launch!(arch, grid, KernelParameters(size, offset),
             _fill_only_north_halo!, c, bc, loc, grid, Tuple(args); kwargs...)
 
-fill_bottom_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = 
+fill_bottom_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = 
     launch!(arch, grid, KernelParameters(size, offset),
             _fill_only_bottom_halo!, c, bc, loc, grid, Tuple(args); kwargs...)
 
-fill_top_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = 
+fill_top_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = 
     launch!(arch, grid, KernelParameters(size, offset),
             _fill_only_top_halo!, c, bc, loc, grid, Tuple(args); kwargs...)
 
@@ -328,17 +327,26 @@ fill_top_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) =
 ##### Kernel launchers for double-sided fill_halos
 #####
 
-function fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, grid, args...; kwargs...)
+function fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, grid, args...; 
+                                  only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
+                                  kwargs...)
+
     return launch!(arch, grid, KernelParameters(size, offset),
                    _fill_west_and_east_halo!, c, west_bc, east_bc, loc, grid, Tuple(args); kwargs...)
 end
 
-function fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, grid, args...; kwargs...)
+function fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, grid, args...; 
+                                    only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
+                                    kwargs...)
+
     return launch!(arch, grid, KernelParameters(size, offset),
                    _fill_south_and_north_halo!, c, south_bc, north_bc, loc, grid, Tuple(args); kwargs...)
 end
 
-function fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, grid, args...; kwargs...)
+function fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, grid, args...; 
+                                  only_local_halos = false,  # Only valid for `DistributedGrids`, we throw it away here
+                                  kwargs...)
+
     return launch!(arch, grid, KernelParameters(size, offset),
                    _fill_bottom_and_top_halo!, c, bottom_bc, top_bc, loc, grid, Tuple(args); kwargs...)
 end

From e069051e1926705e620abd8ab93b2b2bcd068a44 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 13 Nov 2024 09:33:39 +0100
Subject: [PATCH 14/17] reduce time for tridiagonal solve

---
 src/Solvers/batched_tridiagonal_solver.jl | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 2886847938..4e9aca12f3 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -99,13 +99,16 @@ Press William, H., Teukolsky Saul, A., Vetterling William, T., & Flannery Brian,
 """
 function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
 
-    launch_config = if solver.tridiagonal_direction isa XDirection
-                        :yz
-                    elseif solver.tridiagonal_direction isa YDirection
-                        :xz
-                    elseif solver.tridiagonal_direction isa ZDirection
-                        :xy
-                    end
+    if solver.tridiagonal_direction isa XDirection
+        launch_config = :yz
+        active_cells_map = nothing
+    elseif solver.tridiagonal_direction isa YDirection
+        launch_config = :xz
+        active_cells_map = nothing
+    elseif solver.tridiagonal_direction isa ZDirection
+        launch_config = :xy
+        active_cells_map = retrieve_surface_active_cells_map(grid)
+    end
 
     launch!(architecture(solver), solver.grid, launch_config,
             solve_batched_tridiagonal_system_kernel!, ϕ,
@@ -117,7 +120,8 @@ function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
             solver.grid,
             solver.parameters,
             Tuple(args),
-            solver.tridiagonal_direction)
+            solver.tridiagonal_direction;
+            active_cells_map)
 
     return nothing
 end

From f0a887a4094b31f4121d6269198988f8dc48d3fc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 13 Nov 2024 13:06:24 +0100
Subject: [PATCH 15/17] see if tests pass (especially distributed where we have
 active_cells)

---
 src/BoundaryConditions/fill_halo_regions_periodic.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/BoundaryConditions/fill_halo_regions_periodic.jl b/src/BoundaryConditions/fill_halo_regions_periodic.jl
index 890b9c2fba..edd97528fa 100644
--- a/src/BoundaryConditions/fill_halo_regions_periodic.jl
+++ b/src/BoundaryConditions/fill_halo_regions_periodic.jl
@@ -15,19 +15,19 @@ end
 
 @inline fix_halo_offsets(o, co) = co > 0 ? o - co : o # Windowed fields have only positive offsets to correct
 
-function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
+function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; only_local_halos = false, kw...)
     c_parent, yz_size, offset = parent_size_and_offset(c, 2, 3, size, offset)
     launch!(arch, grid, KernelParameters(yz_size, offset), fill_periodic_west_and_east_halo!, c_parent, Val(grid.Hx), grid.Nx; kw...)
     return nothing
 end
 
-function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
+function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; only_local_halos = false, kw...)
     c_parent, xz_size, offset = parent_size_and_offset(c, 1, 3, size, offset)
     launch!(arch, grid, KernelParameters(xz_size, offset), fill_periodic_south_and_north_halo!, c_parent, Val(grid.Hy), grid.Ny;  kw...)
     return nothing
 end
 
-function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
+function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; only_local_halos = false, kw...)
     c_parent, xy_size, offset = parent_size_and_offset(c, 1, 2, size, offset)
     launch!(arch, grid, KernelParameters(xy_size, offset), fill_periodic_bottom_and_top_halo!, c_parent, Val(grid.Hz), grid.Nz; kw...)
     return nothing

From 7b33e7a417b9ecaa54497221f35c651c549eabcf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 13 Nov 2024 13:52:05 +0100
Subject: [PATCH 16/17] bugfix

---
 src/Solvers/batched_tridiagonal_solver.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 4e9aca12f3..83cb5b2236 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -1,5 +1,6 @@
 using Oceananigans.Architectures: on_architecture
 using Oceananigans.Grids: XDirection, YDirection, ZDirection
+using Oceananigans.ImmersedBoundaries: retrieve_surface_active_cells_map
 
 import Oceananigans.Architectures: architecture
 

From 26194f320df8e03a02bc4bca0f03a7134a271ff0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 13 Nov 2024 14:45:17 +0100
Subject: [PATCH 17/17] grid from solver

---
 src/Solvers/batched_tridiagonal_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 83cb5b2236..4f756ca3ac 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -108,7 +108,7 @@ function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
         active_cells_map = nothing
     elseif solver.tridiagonal_direction isa ZDirection
         launch_config = :xy
-        active_cells_map = retrieve_surface_active_cells_map(grid)
+        active_cells_map = retrieve_surface_active_cells_map(solver.grid)
     end
 
     launch!(architecture(solver), solver.grid, launch_config,