From 279e4c43d7039487be4cd268489c865b2d56a66f Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 16:39:36 +0100 Subject: [PATCH 01/17] support for MappedFunctions --- kernel_maps.jl | 0 ...ute_hydrostatic_free_surface_tendencies.jl | 29 ++----- src/Utils/kernel_launching.jl | 87 ++++++++++++++++--- 3 files changed, 81 insertions(+), 35 deletions(-) create mode 100644 kernel_maps.jl diff --git a/kernel_maps.jl b/kernel_maps.jl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl index 82c71b05dd..e61ef7dfa5 100644 --- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl +++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl @@ -96,7 +96,6 @@ function compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_ compute_hydrostatic_free_surface_Gc!, c_tendency, grid, - active_cells_map, args; active_cells_map) end @@ -161,12 +160,12 @@ function compute_hydrostatic_momentum_tendencies!(model, velocities, kernel_para launch!(arch, grid, kernel_parameters, compute_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, - active_cells_map, u_kernel_args; + u_kernel_args; active_cells_map) launch!(arch, grid, kernel_parameters, compute_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, - active_cells_map, v_kernel_args; + v_kernel_args; active_cells_map) compute_free_surface_tendency!(grid, model, :xy) @@ -200,45 +199,27 @@ end ##### """ Calculate the right-hand-side of the u-velocity equation. """ -@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, ::Nothing, args) +@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...) end -@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, active_cells_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, active_cells_map) - @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...) -end - """ Calculate the right-hand-side of the v-velocity equation. """ -@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, ::Nothing, args) +@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...) end -@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, active_cells_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, active_cells_map) - @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...) -end - ##### ##### Tendency calculators for tracers ##### """ Calculate the right-hand-side of the tracer advection-diffusion equation. """ -@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, ::Nothing, args) +@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...) end -@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, active_cells_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, active_cells_map) - @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...) -end - ##### ##### Tendency calculators for an explicit free surface ##### diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 3625ae7632..6231e6be23 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -7,6 +7,7 @@ using Oceananigans.Architectures using Oceananigans.Grids using Oceananigans.Grids: AbstractGrid using Base: @pure +using KernelAbstractions: Kernel import Oceananigans import KernelAbstractions: get, expand @@ -80,6 +81,18 @@ end contiguousrange(range::NTuple{N, Int}, offset::NTuple{N, Int}) where N = Tuple(1+o:r+o for (r, o) in zip(range, offset)) flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] for d = 1:3) +#### +#### Internal utility to launch a function mapped on an index_map +#### + +struct MappedFunction{M} <: Function + f::Function + index_map::M +end + +@inline (m::MappedFunction)(_ctx_) = m.f(_ctx_) +@inline (m::MappedFunction)(_ctx_, args...) = m.f(_ctx_, args...) + # Support for 1D heuristic_workgroup(Wx) = min(Wx, 256) @@ -238,9 +251,20 @@ the architecture `arch`. dev = Architectures.device(arch) loop = kernel!(dev, workgroup, worksize) + + # Map out the function to use active_cells_map + # as an index map + if !isnothing(active_cells_map) + func = MappedFunction(loop.f, active_cells_map) + param = get_kernel_parameters(loop) + M = typeof(func) + loop = Kernel{param..., M}(dev, func) + end + return loop, worksize end +@inline get_kernel_parameters(k::Kernel{A, B, C}) where {A, B, C} = A, B, C """ launch!(arch, grid, workspec, kernel!, kernel_args...; kw...) @@ -272,10 +296,7 @@ end @inline function _launch!(arch, grid, workspec, kernel!, first_kernel_arg, other_kernel_args...; exclude_periphery = false, reduced_dimensions = (), - active_cells_map = nothing, - # TODO: these two kwargs do nothing: - only_local_halos = false, - async = false) + active_cells_map = nothing) location = Oceananigans.location(first_kernel_arg) @@ -320,6 +341,13 @@ using KernelAbstractions: Kernel using KernelAbstractions.NDIteration: _Size, StaticSize using KernelAbstractions.NDIteration: NDRange +using KernelAbstractions.NDIteration +using KernelAbstractions: ndrange, workgroupsize +import KernelAbstractions: partition + +using KernelAbstractions: CompilerMetadata +import KernelAbstractions: __ndrange, __groupsize + struct OffsetStaticSize{S} <: _Size function OffsetStaticSize{S}() where S new{S::Tuple{Vararg}}() @@ -369,13 +397,6 @@ const OffsetNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:KernelO return CartesianIndex(nI) end -using KernelAbstractions.NDIteration -using KernelAbstractions: ndrange, workgroupsize -import KernelAbstractions: partition - -using KernelAbstractions: CompilerMetadata -import KernelAbstractions: __ndrange, __groupsize - @inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange<:OffsetStaticSize} = CartesianIndices(get(NDRange)) @inline __groupsize(cm::CompilerMetadata{NDRange}) where {NDRange<:OffsetStaticSize} = size(__ndrange(cm)) @@ -413,3 +434,47 @@ function partition(kernel::OffsetKernel, inrange, ingroupsize) return iterspace, dynamic end +##### +##### Utilities for Mapped kernels +##### + +struct IndexMap{M} + index_map :: M +end + +const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N + +# NDRange has been modified to have offsets in place of workitems: Remember, dynamic offset kernels are not possible with this extension!! +# TODO: maybe don't do this +@inline function expand(ndrange::MappedNDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N} + nI = ntuple(Val(N)) do I + Base.@_inline_meta + offsets = workitems(ndrange) + stride = size(offsets, I) + gidx = groupidx.I[I] + @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]] + end + return CartesianIndex(nI...) +end + +const MappedKernel = Kernel{<:Any, <:Any, <:Any, <:MappedFunction} + +# Extending the partition function to include offsets in NDRange: note that in this case the +# offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels +function partition(kernel::MappedKernel, inrange, ingroupsize) + static_workgroupsize = workgroupsize(kernel) + + # Calculate the static NDRange and WorkgroupSize + index_map = kernel.f.index_map + range = length(index_map) + groupsize = get(static_workgroupsize) + + blocks, groupsize, dynamic = NDIteration.partition(range, groupsize) + + static_blocks = StaticSize{blocks} + static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize + + iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map)) + + return iterspace, dynamic +end From 6182e346a056739533e7ac7727a8cf704a48c6db Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 16:49:01 +0100 Subject: [PATCH 02/17] remove double kernels everywhere --- .../hydrostatic_free_surface_ab2_step.jl | 1 - .../split_explicit_free_surface_kernels.jl | 21 +--------- ...ore_hydrostatic_free_surface_tendencies.jl | 1 - .../compute_nonhydrostatic_tendencies.jl | 39 ++++--------------- 4 files changed, 10 insertions(+), 52 deletions(-) diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl index c76ac3b525..9cbb8e271f 100644 --- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl +++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl @@ -1,7 +1,6 @@ using Oceananigans.Fields: location using Oceananigans.TimeSteppers: ab2_step_field! using Oceananigans.TurbulenceClosures: implicit_step! -using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map, retrieve_surface_active_cells_map import Oceananigans.TimeSteppers: ab2_step! diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl index 0042755123..786f99e129 100644 --- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl +++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl @@ -133,7 +133,7 @@ end # Barotropic Model Kernels # u_Δz = u * Δz -@kernel function _barotropic_mode_kernel!(U, V, grid, ::Nothing, u, v) +@kernel function _barotropic_mode_kernel!(U, V, grid, u, v) i, j = @index(Global, NTuple) k_top = grid.Nz+1 @@ -146,26 +146,9 @@ end end end -# Barotropic Model Kernels -# u_Δz = u * Δz -@kernel function _barotropic_mode_kernel!(U, V, grid, active_cells_map, u, v) - idx = @index(Global, Linear) - i, j = active_linear_index_to_tuple(idx, active_cells_map) - k_top = grid.Nz+1 - - @inbounds U[i, j, k_top-1] = Δzᶠᶜᶜ(i, j, 1, grid) * u[i, j, 1] - @inbounds V[i, j, k_top-1] = Δzᶜᶠᶜ(i, j, 1, grid) * v[i, j, 1] - - for k in 2:grid.Nz - @inbounds U[i, j, k_top-1] += Δzᶠᶜᶜ(i, j, k, grid) * u[i, j, k] - @inbounds V[i, j, k_top-1] += Δzᶜᶠᶜ(i, j, k, grid) * v[i, j, k] - end -end - @inline function compute_barotropic_mode!(U, V, grid, u, v) active_cells_map = retrieve_surface_active_cells_map(grid) - - launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, active_cells_map, u, v; active_cells_map) + launch!(architecture(grid), grid, :xy, _barotropic_mode_kernel!, U, V, grid, u, v; active_cells_map) return nothing end diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl index a84d03e13f..089370fdd9 100644 --- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl +++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl @@ -4,7 +4,6 @@ using Oceananigans.TimeSteppers: store_field_tendencies! using Oceananigans: prognostic_fields using Oceananigans.Grids: AbstractGrid -using Oceananigans.ImmersedBoundaries: retrieve_interior_active_cells_map using Oceananigans.Utils: launch! diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl index ffa7865bb2..67bcbbbb98 100644 --- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl +++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl @@ -103,15 +103,15 @@ function compute_interior_tendency_contributions!(model, kernel_parameters; acti exclude_periphery = true launch!(arch, grid, kernel_parameters, compute_Gu!, - tendencies.u, grid, active_cells_map, u_kernel_args; + tendencies.u, grid, u_kernel_args; active_cells_map, exclude_periphery) launch!(arch, grid, kernel_parameters, compute_Gv!, - tendencies.v, grid, active_cells_map, v_kernel_args; + tendencies.v, grid, v_kernel_args; active_cells_map, exclude_periphery) launch!(arch, grid, kernel_parameters, compute_Gw!, - tendencies.w, grid, active_cells_map, w_kernel_args; + tendencies.w, grid, w_kernel_args; active_cells_map, exclude_periphery) start_tracer_kernel_args = (advection, closure) @@ -131,7 +131,7 @@ function compute_interior_tendency_contributions!(model, kernel_parameters; acti forcing, clock) launch!(arch, grid, kernel_parameters, compute_Gc!, - c_tendency, grid, active_cells_map, args; + c_tendency, grid, args; active_cells_map) end @@ -143,57 +143,34 @@ end ##### """ Calculate the right-hand-side of the u-velocity equation. """ -@kernel function compute_Gu!(Gu, grid, ::Nothing, args) +@kernel function compute_Gu!(Gu, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...) end -@kernel function compute_Gu!(Gu, grid, interior_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, interior_map) - @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...) -end - """ Calculate the right-hand-side of the v-velocity equation. """ -@kernel function compute_Gv!(Gv, grid, ::Nothing, args) +@kernel function compute_Gv!(Gv, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...) end -@kernel function compute_Gv!(Gv, grid, interior_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, interior_map) - @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...) -end - """ Calculate the right-hand-side of the w-velocity equation. """ -@kernel function compute_Gw!(Gw, grid, ::Nothing, args) +@kernel function compute_Gw!(Gw, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...) end -@kernel function compute_Gw!(Gw, grid, interior_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, interior_map) - @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...) -end ##### ##### Tracer(s) ##### """ Calculate the right-hand-side of the tracer advection-diffusion equation. """ -@kernel function compute_Gc!(Gc, grid, ::Nothing, args) +@kernel function compute_Gc!(Gc, grid, args) i, j, k = @index(Global, NTuple) @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...) end -@kernel function compute_Gc!(Gc, grid, interior_map, args) - idx = @index(Global, Linear) - i, j, k = active_linear_index_to_tuple(idx, interior_map) - @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...) -end - ##### ##### Boundary contributions to tendencies due to user-prescribed fluxes ##### From 5545605ae1631221fb88d7530e06bde8df6bc2f2 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 16:55:18 +0100 Subject: [PATCH 03/17] make sure also GPU works --- src/Utils/kernel_launching.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 6231e6be23..c81a31369f 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -6,6 +6,7 @@ using Oceananigans: location using Oceananigans.Architectures using Oceananigans.Grids using Oceananigans.Grids: AbstractGrid +using Adapt using Base: @pure using KernelAbstractions: Kernel @@ -90,6 +91,9 @@ struct MappedFunction{M} <: Function index_map::M end +Adapt.adapt_structure(to, m::MappedFunction) = + MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.index_map)) + @inline (m::MappedFunction)(_ctx_) = m.f(_ctx_) @inline (m::MappedFunction)(_ctx_, args...) = m.f(_ctx_, args...) From e239dbed09bdf899e7bda469cafab9d3e0822154 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 16:59:30 +0100 Subject: [PATCH 04/17] make sure there is no return in the function --- src/Utils/kernel_launching.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index c81a31369f..7b1409b392 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -94,8 +94,13 @@ end Adapt.adapt_structure(to, m::MappedFunction) = MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.index_map)) -@inline (m::MappedFunction)(_ctx_) = m.f(_ctx_) -@inline (m::MappedFunction)(_ctx_, args...) = m.f(_ctx_, args...) +@inline function (m::MappedFunction)(_ctx_) + m.f(_ctx_) +end + +@inline function (m::MappedFunction)(_ctx_, args...) + m.f(_ctx_, args...) +end # Support for 1D heuristic_workgroup(Wx) = min(Wx, 256) From 32e0c524eb617a75da7ff1841db4a3c0cdfa1f17 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 17:00:15 +0100 Subject: [PATCH 05/17] return nothing --- src/Utils/kernel_launching.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 7b1409b392..9084639507 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -96,10 +96,12 @@ Adapt.adapt_structure(to, m::MappedFunction) = @inline function (m::MappedFunction)(_ctx_) m.f(_ctx_) + return nothing end @inline function (m::MappedFunction)(_ctx_, args...) m.f(_ctx_, args...) + return nothing end # Support for 1D From 0b56304b347c5961b920aaa50b50600b4bfd69ba Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 17:26:44 +0100 Subject: [PATCH 06/17] adapt for GPU usage --- src/BoundaryConditions/fill_halo_regions.jl | 6 ++++- src/Utils/kernel_launching.jl | 28 ++++++++++++--------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl index fa380e7029..50ed69ef46 100644 --- a/src/BoundaryConditions/fill_halo_regions.jl +++ b/src/BoundaryConditions/fill_halo_regions.jl @@ -46,7 +46,11 @@ const MaybeTupledData = Union{OffsetArray, NTuple{<:Any, OffsetArray}} "Fill halo regions in ``x``, ``y``, and ``z`` for a given field's data." function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, loc, grid, args...; - fill_boundary_normal_velocities = true, kwargs...) + fill_boundary_normal_velocities = true, + only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here + async = false, # Only valid for `DistributedGrids`, we throw it away here + kwargs...) + arch = architecture(grid) if fill_boundary_normal_velocities diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 9084639507..9ca6fb5371 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -86,8 +86,8 @@ flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] #### Internal utility to launch a function mapped on an index_map #### -struct MappedFunction{M} <: Function - f::Function +struct MappedFunction{F, M} <: Function + f::F index_map::M end @@ -453,19 +453,21 @@ struct IndexMap{M} index_map :: M end +Adapt.adapt_structure(to, m::IndexMap) = IndexMap(Adapt.adapt(to, m.index_map)) + const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N -# NDRange has been modified to have offsets in place of workitems: Remember, dynamic offset kernels are not possible with this extension!! +# NDRange has been modified to include an index_map in place of workitems: R +# Remember, dynamic offset kernels are not possible with this extension!! +# Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel. # TODO: maybe don't do this -@inline function expand(ndrange::MappedNDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N} - nI = ntuple(Val(N)) do I - Base.@_inline_meta - offsets = workitems(ndrange) - stride = size(offsets, I) - gidx = groupidx.I[I] - @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]] - end - return CartesianIndex(nI...) +@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) + Base.@_inline_meta + offsets = workitems(ndrange) + stride = size(offsets, I) + gidx = groupidx.I[I] + @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]] + return CartesianIndex(nI) end const MappedKernel = Kernel{<:Any, <:Any, <:Any, <:MappedFunction} @@ -478,6 +480,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) # Calculate the static NDRange and WorkgroupSize index_map = kernel.f.index_map range = length(index_map) + arch = Oceananigans.Architectures.architecture(index_map) groupsize = get(static_workgroupsize) blocks, groupsize, dynamic = NDIteration.partition(range, groupsize) @@ -485,6 +488,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) static_blocks = StaticSize{blocks} static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize + index_map = Oceananigans.Architectures.convert_args(arch, index_map) iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map)) return iterspace, dynamic From 052f56e5c272de60c9cbac2e94833541c88df320 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 17:43:28 +0100 Subject: [PATCH 07/17] not sure about the GPU --- src/Utils/kernel_launching.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 9ca6fb5371..41e4770dee 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -461,12 +461,12 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa # Remember, dynamic offset kernels are not possible with this extension!! # Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel. # TODO: maybe don't do this -@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) +@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex, idx::CartesianIndex) Base.@_inline_meta offsets = workitems(ndrange) - stride = size(offsets, I) - gidx = groupidx.I[I] - @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[I]] + stride = size(offsets, 1) + gidx = groupidx.I[1] + @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]] return CartesianIndex(nI) end From d0d56ac02dc785a81b01d2dd0bcf605f625f78f7 Mon Sep 17 00:00:00 2001 From: simone-silvestri Date: Tue, 12 Nov 2024 12:47:31 -0500 Subject: [PATCH 08/17] hmmm --- src/Utils/kernel_launching.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 41e4770dee..667f6ab1ea 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -466,7 +466,7 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa offsets = workitems(ndrange) stride = size(offsets, 1) gidx = groupidx.I[1] - @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]] + nI = @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]] return CartesianIndex(nI) end From 7e41737b458c00e8767f31caba685d683a4c6fe0 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 19:56:08 +0100 Subject: [PATCH 09/17] probably like this it will work on the GPU? --- src/Utils/kernel_launching.jl | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 41e4770dee..65025f0b37 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -88,11 +88,11 @@ flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] struct MappedFunction{F, M} <: Function f::F - index_map::M + imap::M end Adapt.adapt_structure(to, m::MappedFunction) = - MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.index_map)) + MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.imap)) @inline function (m::MappedFunction)(_ctx_) m.f(_ctx_) @@ -466,11 +466,18 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa offsets = workitems(ndrange) stride = size(offsets, 1) gidx = groupidx.I[1] - @inbounds ndrange.workitems.index_map[(gidx - 1) * stride + idx.I[1]] + tI = (gidx - 1) * stride + idx.I[1] + nI = ndrange.workitems.index_map[tI] return CartesianIndex(nI) end -const MappedKernel = Kernel{<:Any, <:Any, <:Any, <:MappedFunction} +const MappedKernel{D} = Kernel{D, <:Any, <:Any, <:MappedFunction} where D + +# Override the getproperty to make sure we get the correct properties +@inline getproperty(k::MappedKernel, prop::Symbol) = get_mapped_property(k, Val(prop)) + +@inline get_mapped_property(k, ::Val{:imap}) = k.f.imap +@inline get_mapped_property(k, ::Val{:f}) = k.f.f # Extending the partition function to include offsets in NDRange: note that in this case the # offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels @@ -478,7 +485,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) static_workgroupsize = workgroupsize(kernel) # Calculate the static NDRange and WorkgroupSize - index_map = kernel.f.index_map + index_map = getproperty(kernel, :imap) range = length(index_map) arch = Oceananigans.Architectures.architecture(index_map) groupsize = get(static_workgroupsize) @@ -492,4 +499,4 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map)) return iterspace, dynamic -end +end \ No newline at end of file From 130a3571732bbe3d28de5dc388591ec71352efe8 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 12 Nov 2024 21:35:57 +0100 Subject: [PATCH 10/17] some cleanup plus testing a test? --- src/Utils/kernel_launching.jl | 39 ++++++++++++++--------------------- test.jl | 18 ++++++++++++++++ 2 files changed, 33 insertions(+), 24 deletions(-) create mode 100644 test.jl diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index b158fe09a4..436cde1037 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -87,20 +87,15 @@ flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] #### struct MappedFunction{F, M} <: Function - f::F - imap::M + func::F + index_map::M end Adapt.adapt_structure(to, m::MappedFunction) = - MappedFunction(Adapt.adapt(to, m.f), Adapt.adapt(to, m.imap)) - -@inline function (m::MappedFunction)(_ctx_) - m.f(_ctx_) - return nothing -end + MappedFunction(Adapt.adapt(to, m.func), Adapt.adapt(to, m.index_map)) @inline function (m::MappedFunction)(_ctx_, args...) - m.f(_ctx_, args...) + m.func(_ctx_, args...) return nothing end @@ -251,6 +246,7 @@ the architecture `arch`. location = nothing, active_cells_map = nothing) + if !isnothing(active_cells_map) # everything else is irrelevant workgroup = min(length(active_cells_map), 256) worksize = length(active_cells_map) @@ -263,19 +259,16 @@ the architecture `arch`. dev = Architectures.device(arch) loop = kernel!(dev, workgroup, worksize) - # Map out the function to use active_cells_map - # as an index map + # Map out the function to use active_cells_map as an index map if !isnothing(active_cells_map) - func = MappedFunction(loop.f, active_cells_map) - param = get_kernel_parameters(loop) - M = typeof(func) - loop = Kernel{param..., M}(dev, func) + func = MappedFunction(loop.f, active_cells_map) + loop = Kernel{get_kernel_parameters(loop)..., typeof(func)}(dev, func) end return loop, worksize end -@inline get_kernel_parameters(k::Kernel{A, B, C}) where {A, B, C} = A, B, C +@inline get_kernel_parameters(::Kernel{Dev, B, W}) where {Dev, B, W} = Dev, B, W """ launch!(arch, grid, workspec, kernel!, kernel_args...; kw...) @@ -347,8 +340,6 @@ end ##### # TODO: when offsets are implemented in KA so that we can call `kernel(dev, group, size, offsets)`, remove all of this - -using KernelAbstractions: Kernel using KernelAbstractions.NDIteration: _Size, StaticSize using KernelAbstractions.NDIteration: NDRange @@ -450,14 +441,14 @@ end ##### struct IndexMap{M} - imap :: M + index_map :: M end Adapt.adapt_structure(to, m::IndexMap) = IndexMap(Adapt.adapt(to, m.index_map)) const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N -# NDRange has been modified to include an index_map in place of workitems: R +# NDRange has been modified to include an index_map in place of workitems. # Remember, dynamic offset kernels are not possible with this extension!! # Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel. # TODO: maybe don't do this @@ -467,7 +458,7 @@ const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMa stride = size(offsets, 1) gidx = groupidx.I[1] tI = (gidx - 1) * stride + idx.I[1] - nI = ndrange.workitems.imap[tI] + nI = ndrange.workitems.index_map[tI] return CartesianIndex(nI) end @@ -476,8 +467,8 @@ const MappedKernel{D} = Kernel{D, <:Any, <:Any, <:MappedFunction} where D # Override the getproperty to make sure we get the correct properties @inline getproperty(k::MappedKernel, prop::Symbol) = get_mapped_property(k, Val(prop)) -@inline get_mapped_property(k, ::Val{:imap}) = k.f.imap -@inline get_mapped_property(k, ::Val{:f}) = k.f.f +@inline get_mapped_property(k, ::Val{:index_map}) = k.f.index_map +@inline get_mapped_property(k, ::Val{:func}) = k.f.func # Extending the partition function to include offsets in NDRange: note that in this case the # offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels @@ -485,7 +476,7 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) static_workgroupsize = workgroupsize(kernel) # Calculate the static NDRange and WorkgroupSize - index_map = getproperty(kernel, :imap) + index_map = getproperty(kernel, :index_map) range = length(index_map) arch = Oceananigans.Architectures.architecture(index_map) groupsize = get(static_workgroupsize) diff --git a/test.jl b/test.jl new file mode 100644 index 0000000000..70239fa292 --- /dev/null +++ b/test.jl @@ -0,0 +1,18 @@ +using Revise +using Oceananigans +using Oceananigans.Utils + +using KernelAbstractions: @index, @kernel + +arch = CPU() + +@kernel function _test_indices(a) + i, j, k = @index(Global, NTuple) + a[i, j, k] = i + j + k +end + +grid = RectilinearGrid(arch, size = (3, 3, 3), extent = (1, 1, 1)) +array = zeros(arch, 3, 3, 3) +imap = on_architecture(arch, [(i, j, k) for i in 1:3, j in 1:3, k in 1:2]) + +launch!(arch, grid_cpu, :xyz, _test_indices, array; active_cells_map = imap) \ No newline at end of file From 3be13dd0860740de74fa91b5df2532a623df1ed9 Mon Sep 17 00:00:00 2001 From: simone-silvestri Date: Tue, 12 Nov 2024 18:40:50 -0500 Subject: [PATCH 11/17] works on gpu, to fix the dynamic check --- src/Utils/kernel_launching.jl | 91 +++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl index 436cde1037..02473f0d5b 100644 --- a/src/Utils/kernel_launching.jl +++ b/src/Utils/kernel_launching.jl @@ -12,6 +12,7 @@ using KernelAbstractions: Kernel import Oceananigans import KernelAbstractions: get, expand +import Base struct KernelParameters{S, O} end @@ -82,23 +83,12 @@ end contiguousrange(range::NTuple{N, Int}, offset::NTuple{N, Int}) where N = Tuple(1+o:r+o for (r, o) in zip(range, offset)) flatten_reduced_dimensions(worksize, dims) = Tuple(d ∈ dims ? 1 : worksize[d] for d = 1:3) -#### -#### Internal utility to launch a function mapped on an index_map -#### - +# Internal utility to launch a function mapped on an index_map struct MappedFunction{F, M} <: Function func::F index_map::M end -Adapt.adapt_structure(to, m::MappedFunction) = - MappedFunction(Adapt.adapt(to, m.func), Adapt.adapt(to, m.index_map)) - -@inline function (m::MappedFunction)(_ctx_, args...) - m.func(_ctx_, args...) - return nothing -end - # Support for 1D heuristic_workgroup(Wx) = min(Wx, 256) @@ -440,35 +430,41 @@ end ##### Utilities for Mapped kernels ##### -struct IndexMap{M} - index_map :: M -end - -Adapt.adapt_structure(to, m::IndexMap) = IndexMap(Adapt.adapt(to, m.index_map)) +struct IndexMap end -const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:IndexMap} where N +const MappedNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:IndexMap, <:AbstractArray} where N # NDRange has been modified to include an index_map in place of workitems. # Remember, dynamic offset kernels are not possible with this extension!! # Also, mapped kernels work only with a 1D kernel and a 1D map, it is not possible to launch a ND kernel. # TODO: maybe don't do this -@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex, idx::CartesianIndex) - Base.@_inline_meta - offsets = workitems(ndrange) - stride = size(offsets, 1) - gidx = groupidx.I[1] - tI = (gidx - 1) * stride + idx.I[1] - nI = ndrange.workitems.index_map[tI] - return CartesianIndex(nI) +@inline function expand(ndrange::MappedNDRange, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where N + nI = ntuple(Val(N)) do I + Base.@_inline_meta + offsets = workitems(ndrange) + stride = size(offsets, I) + gidx = groupidx.I[I] + ndrange.workitems[(gidx - 1) * stride + idx.I[I]] + end + return CartesianIndex(nI...) end const MappedKernel{D} = Kernel{D, <:Any, <:Any, <:MappedFunction} where D -# Override the getproperty to make sure we get the correct properties -@inline getproperty(k::MappedKernel, prop::Symbol) = get_mapped_property(k, Val(prop)) +# Override the getproperty to make sure we launch the correct function in the kernel +@inline Base.getproperty(k::MappedKernel, prop::Symbol) = get_mapped_kernel_property(k, Val(prop)) + +@inline get_mapped_kernel_property(k, ::Val{prop}) where prop = getfield(k, prop) +@inline get_mapped_kernel_property(k, ::Val{:index_map}) = getfield(getfield(k, :f), :index_map) +@inline get_mapped_kernel_property(k, ::Val{:f}) = getfield(getfield(k, :f), :func) -@inline get_mapped_property(k, ::Val{:index_map}) = k.f.index_map -@inline get_mapped_property(k, ::Val{:func}) = k.f.func +Adapt.adapt_structure(to, cm::CompilerMetadata{N, C}) where {N, C} = + CompilerMetadata{N, C}(Adapt.adapt(to, cm.groupindex), + Adapt.adapt(to, cm.ndrange), + Adapt.adapt(to, cm.iterspace)) + +Adapt.adapt_structure(to, ndrange::NDRange{N, B, W}) where {N, B, W} = + NDRange{N, B, W}(Adapt.adapt(to, ndrange.blocks), Adapt.adapt(to, ndrange.workitems)) # Extending the partition function to include offsets in NDRange: note that in this case the # offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels @@ -476,9 +472,8 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) static_workgroupsize = workgroupsize(kernel) # Calculate the static NDRange and WorkgroupSize - index_map = getproperty(kernel, :index_map) + index_map = kernel.index_map range = length(index_map) - arch = Oceananigans.Architectures.architecture(index_map) groupsize = get(static_workgroupsize) blocks, groupsize, dynamic = NDIteration.partition(range, groupsize) @@ -486,8 +481,34 @@ function partition(kernel::MappedKernel, inrange, ingroupsize) static_blocks = StaticSize{blocks} static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize - index_map = Oceananigans.Architectures.convert_args(arch, index_map) - iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, IndexMap(index_map)) + iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(IndexMap(), index_map) return iterspace, dynamic -end \ No newline at end of file +end + +using KernelAbstractions: CompilerMetadata, NoDynamicCheck +using CUDA: CUDABackend + +import KernelAbstractions: mkcontext, __dynamic_checkbounds, __validindex + +# Very dangerous override of mkcontext which will not work if we are not +# carefull with making sure that indices are correct when launching a `MappedKernel` +# TODO: Definitely change this with options below +function mkcontext(kernel::Kernel{CUDABackend}, _ndrange, iterspace::MappedNDRange) + return CompilerMetadata{ndrange(kernel), NoDynamicCheck}(_ndrange, iterspace) +end + +# Alternative to the above to fix: +# const MappedCompilerMetadata = CompilerMetadata{<:Any, <:Any, <:Any, <:Any, <:MappedNDRange} + +# @inline __ndrange(cm::MappedCompilerMetadata) = cm.iterspace + +# @inline function __validindex(ctx::MappedCompilerMetadata, idx::CartesianIndex) +# # Turns this into a noop for code where we can turn of checkbounds of +# if __dynamic_checkbounds(ctx) +# I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx) +# return I in __ndrange(ctx) +# else +# return true +# end +# end \ No newline at end of file From 64cad183deccf3fd500422ec7868c5551904eb24 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 13 Nov 2024 08:36:26 +0100 Subject: [PATCH 12/17] move lwargs --- src/BoundaryConditions/fill_halo_regions.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl index 50ed69ef46..c36c9c3fbf 100644 --- a/src/BoundaryConditions/fill_halo_regions.jl +++ b/src/BoundaryConditions/fill_halo_regions.jl @@ -47,8 +47,6 @@ const MaybeTupledData = Union{OffsetArray, NTuple{<:Any, OffsetArray}} "Fill halo regions in ``x``, ``y``, and ``z`` for a given field's data." function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, loc, grid, args...; fill_boundary_normal_velocities = true, - only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here - async = false, # Only valid for `DistributedGrids`, we throw it away here kwargs...) arch = architecture(grid) @@ -68,7 +66,10 @@ function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, lo return nothing end -function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid, args...; kwargs...) +function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid, args...; + only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here + async = false, # Only valid for `DistributedGrids`, we throw it away here + kwargs...) # Calculate size and offset of the fill_halo kernel # We assume that the kernel size is the same for west and east boundaries, From 333bfd707056483e19f99c4fca1ea28a8a78f9e2 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 13 Nov 2024 09:12:08 +0100 Subject: [PATCH 13/17] kwarg management --- src/BoundaryConditions/fill_halo_regions.jl | 28 +++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl index c36c9c3fbf..686f91fd6f 100644 --- a/src/BoundaryConditions/fill_halo_regions.jl +++ b/src/BoundaryConditions/fill_halo_regions.jl @@ -67,7 +67,6 @@ function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, lo end function fill_halo_event!(c, fill_halos!, bcs, indices, loc, arch, grid, args...; - only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here async = false, # Only valid for `DistributedGrids`, we throw it away here kwargs...) @@ -300,27 +299,27 @@ end ##### Kernel launchers for single-sided fill_halos ##### -fill_west_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = +fill_west_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = launch!(arch, grid, KernelParameters(size, offset), _fill_only_west_halo!, c, bc, loc, grid, Tuple(args); kwargs...) -fill_east_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = +fill_east_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = launch!(arch, grid, KernelParameters(size, offset), _fill_only_east_halo!, c, bc, loc, grid, Tuple(args); kwargs...) -fill_south_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = +fill_south_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = launch!(arch, grid, KernelParameters(size, offset), _fill_only_south_halo!, c, bc, loc, grid, Tuple(args); kwargs...) -fill_north_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = +fill_north_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = launch!(arch, grid, KernelParameters(size, offset), _fill_only_north_halo!, c, bc, loc, grid, Tuple(args); kwargs...) -fill_bottom_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = +fill_bottom_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = launch!(arch, grid, KernelParameters(size, offset), _fill_only_bottom_halo!, c, bc, loc, grid, Tuple(args); kwargs...) -fill_top_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = +fill_top_halo!(c, bc, size, offset, loc, arch, grid, args...; only_local_halos = false, kwargs...) = launch!(arch, grid, KernelParameters(size, offset), _fill_only_top_halo!, c, bc, loc, grid, Tuple(args); kwargs...) @@ -328,17 +327,26 @@ fill_top_halo!(c, bc, size, offset, loc, arch, grid, args...; kwargs...) = ##### Kernel launchers for double-sided fill_halos ##### -function fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, grid, args...; kwargs...) +function fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, grid, args...; + only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here + kwargs...) + return launch!(arch, grid, KernelParameters(size, offset), _fill_west_and_east_halo!, c, west_bc, east_bc, loc, grid, Tuple(args); kwargs...) end -function fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, grid, args...; kwargs...) +function fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, grid, args...; + only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here + kwargs...) + return launch!(arch, grid, KernelParameters(size, offset), _fill_south_and_north_halo!, c, south_bc, north_bc, loc, grid, Tuple(args); kwargs...) end -function fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, grid, args...; kwargs...) +function fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, grid, args...; + only_local_halos = false, # Only valid for `DistributedGrids`, we throw it away here + kwargs...) + return launch!(arch, grid, KernelParameters(size, offset), _fill_bottom_and_top_halo!, c, bottom_bc, top_bc, loc, grid, Tuple(args); kwargs...) end From e069051e1926705e620abd8ab93b2b2bcd068a44 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 13 Nov 2024 09:33:39 +0100 Subject: [PATCH 14/17] reduce time for tridiagonal solve --- src/Solvers/batched_tridiagonal_solver.jl | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl index 2886847938..4e9aca12f3 100644 --- a/src/Solvers/batched_tridiagonal_solver.jl +++ b/src/Solvers/batched_tridiagonal_solver.jl @@ -99,13 +99,16 @@ Press William, H., Teukolsky Saul, A., Vetterling William, T., & Flannery Brian, """ function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...) - launch_config = if solver.tridiagonal_direction isa XDirection - :yz - elseif solver.tridiagonal_direction isa YDirection - :xz - elseif solver.tridiagonal_direction isa ZDirection - :xy - end + if solver.tridiagonal_direction isa XDirection + launch_config = :yz + active_cells_map = nothing + elseif solver.tridiagonal_direction isa YDirection + launch_config = :xz + active_cells_map = nothing + elseif solver.tridiagonal_direction isa ZDirection + launch_config = :xy + active_cells_map = retrieve_surface_active_cells_map(grid) + end launch!(architecture(solver), solver.grid, launch_config, solve_batched_tridiagonal_system_kernel!, ϕ, @@ -117,7 +120,8 @@ function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...) solver.grid, solver.parameters, Tuple(args), - solver.tridiagonal_direction) + solver.tridiagonal_direction; + active_cells_map) return nothing end From f0a887a4094b31f4121d6269198988f8dc48d3fc Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 13 Nov 2024 13:06:24 +0100 Subject: [PATCH 15/17] see if tests pass (especially distributed where we have active_cells) --- src/BoundaryConditions/fill_halo_regions_periodic.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/BoundaryConditions/fill_halo_regions_periodic.jl b/src/BoundaryConditions/fill_halo_regions_periodic.jl index 890b9c2fba..edd97528fa 100644 --- a/src/BoundaryConditions/fill_halo_regions_periodic.jl +++ b/src/BoundaryConditions/fill_halo_regions_periodic.jl @@ -15,19 +15,19 @@ end @inline fix_halo_offsets(o, co) = co > 0 ? o - co : o # Windowed fields have only positive offsets to correct -function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...) +function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; only_local_halos = false, kw...) c_parent, yz_size, offset = parent_size_and_offset(c, 2, 3, size, offset) launch!(arch, grid, KernelParameters(yz_size, offset), fill_periodic_west_and_east_halo!, c_parent, Val(grid.Hx), grid.Nx; kw...) return nothing end -function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...) +function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; only_local_halos = false, kw...) c_parent, xz_size, offset = parent_size_and_offset(c, 1, 3, size, offset) launch!(arch, grid, KernelParameters(xz_size, offset), fill_periodic_south_and_north_halo!, c_parent, Val(grid.Hy), grid.Ny; kw...) return nothing end -function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...) +function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; only_local_halos = false, kw...) c_parent, xy_size, offset = parent_size_and_offset(c, 1, 2, size, offset) launch!(arch, grid, KernelParameters(xy_size, offset), fill_periodic_bottom_and_top_halo!, c_parent, Val(grid.Hz), grid.Nz; kw...) return nothing From 7b33e7a417b9ecaa54497221f35c651c549eabcf Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 13 Nov 2024 13:52:05 +0100 Subject: [PATCH 16/17] bugfix --- src/Solvers/batched_tridiagonal_solver.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl index 4e9aca12f3..83cb5b2236 100644 --- a/src/Solvers/batched_tridiagonal_solver.jl +++ b/src/Solvers/batched_tridiagonal_solver.jl @@ -1,5 +1,6 @@ using Oceananigans.Architectures: on_architecture using Oceananigans.Grids: XDirection, YDirection, ZDirection +using Oceananigans.ImmersedBoundaries: retrieve_surface_active_cells_map import Oceananigans.Architectures: architecture From 26194f320df8e03a02bc4bca0f03a7134a271ff0 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 13 Nov 2024 14:45:17 +0100 Subject: [PATCH 17/17] grid from solver --- src/Solvers/batched_tridiagonal_solver.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl index 83cb5b2236..4f756ca3ac 100644 --- a/src/Solvers/batched_tridiagonal_solver.jl +++ b/src/Solvers/batched_tridiagonal_solver.jl @@ -108,7 +108,7 @@ function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...) active_cells_map = nothing elseif solver.tridiagonal_direction isa ZDirection launch_config = :xy - active_cells_map = retrieve_surface_active_cells_map(grid) + active_cells_map = retrieve_surface_active_cells_map(solver.grid) end launch!(architecture(solver), solver.grid, launch_config,