diff --git a/Project.toml b/Project.toml index 246af4bec54..5715725fd0d 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Michael Schlottke-Lakemper ", "Gregor version = "0.4.29-pre" [deps] +ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" @@ -40,6 +41,7 @@ TriplotRecipes = "808ab39a-a642-4abf-81ff-4cb34ebbffa3" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [compat] +ArrayInterface = "3" CodeTracking = "1.0.5" ConstructionBase = "1.3" EllipsisNotation = "1.0" diff --git a/docs/src/reference-trixi.md b/docs/src/reference-trixi.md index 940051115d5..73ef8fe27e8 100644 --- a/docs/src/reference-trixi.md +++ b/docs/src/reference-trixi.md @@ -5,5 +5,5 @@ CurrentModule = Trixi ``` ```@autodocs -Modules = [Trixi] +Modules = [Trixi, Trixi.TrixiMPIArrays] ``` diff --git a/src/Trixi.jl b/src/Trixi.jl index 04a8ba707df..7f497805b43 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -25,6 +25,7 @@ using SparseArrays: AbstractSparseMatrix, AbstractSparseMatrixCSC, sparse, dropt # import @reexport now to make it available for further imports/exports using Reexport: @reexport +using ArrayInterface: static_length using SciMLBase: CallbackSet, DiscreteCallback, ODEProblem, ODESolution, ODEFunction import SciMLBase: get_du, get_tmp_cache, u_modified!, @@ -39,7 +40,6 @@ using HDF5: h5open, attributes using IfElse: ifelse using LinearMaps: LinearMap using LoopVectorization: LoopVectorization, @turbo, indices -using LoopVectorization.ArrayInterface: static_length using MPI: MPI using MuladdMacro: @muladd using GeometryBasics: GeometryBasics @@ -99,6 +99,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") include("auxiliary/mpi.jl") +include("auxiliary/mpi_arrays.jl") include("auxiliary/p4est.jl") include("equations/equations.jl") include("meshes/meshes.jl") diff --git a/src/auxiliary/mpi_arrays.jl b/src/auxiliary/mpi_arrays.jl new file mode 100644 index 00000000000..97b9a0d99ba --- /dev/null +++ b/src/auxiliary/mpi_arrays.jl @@ -0,0 +1,241 @@ + +# TODO: MPI. We keep this module inside Trixi for now. When it stabilizes and +# turns out to be generally useful, we can consider moving it to a +# separate package with simple test suite and documentation. +module TrixiMPIArrays + +using ArrayInterface: ArrayInterface +using MPI: MPI + +using ..Trixi: Trixi, mpi_comm + +export TrixiMPIArray, ode_norm, ode_unstable_check + + +# Dispatch etc. +# The following functions have special dispatch behavior for `TrixiMPIArray`s. +# - `Trixi.wrap_array`: +# the wrapped array is wrapped again in a `TrixiMPIArray` +# - `Trixi.wrap_array_native`: +# should not be changed since it should return a plain `Array` +# - `Trixi.allocate_coefficients`: +# this handles the return type of initialization stuff when setting an IC +# with MPI +# Besides these, we usually dispatch on MPI mesh types such as +# `mesh::ParallelTreeMesh` or ``mesh::ParallelP4eestMesh` in Trixi.jl, since +# this is consistent with other dispatches on the mesh type. However, we +# dispatch on `u::TrixiMPIArray` whenever this allows simplifying some code, +# e.g., because we can call a basic function on `parent(u)` and add some MPI +# stuff on top. +""" + TrixiMPIArray{T, N} <: AbstractArray{T, N} + TrixiMPIArray(u::AbstractArray{T, N})::TrixiMPIArray{T, N} + +A thin wrapper of arrays distributed via MPI used in Trixi.jl. 
The idea is that +these arrays behave as much as possible as plain arrays would in an SPMD-style +distributed MPI setting with exception of reductions, which are performed +globally. This allows to use these arrays in ODE solvers such as the ones from +OrdinaryDiffEq.jl, since vector space operations, broadcasting, and reductions +are the only operations required for explicit time integration methods with +fixed step sizes or adaptive step sizes based on CFL or error estimates. + +!!! warning "Experimental code" + This code is experimental and may be changed or removed in any future release. +""" +struct TrixiMPIArray{T, N, Parent<:AbstractArray{T, N}} <: AbstractArray{T, N} + u_local::Parent + mpi_comm::MPI.Comm +end + +function TrixiMPIArray(u_local::AbstractArray{T, N}) where {T, N} + # TODO: MPI. Hard-coded to MPI.COMM_WORLD for now + mpi_comm = MPI.COMM_WORLD + TrixiMPIArray{T, N, typeof(u_local)}(u_local, mpi_comm) +end + + +# `Base.show` with additional helpful information +function Base.show(io::IO, u::TrixiMPIArray) + print(io, "TrixiMPIArray wrapping ", parent(u)) +end + +function Base.show(io::IO, mime::MIME"text/plain", u::TrixiMPIArray) + print(io, "TrixiMPIArray wrapping ") + show(io, mime, parent(u)) +end + + +# Custom interface and general Base interface not covered by other parts below +Base.parent(u::TrixiMPIArray) = u.u_local +Base.resize!(u::TrixiMPIArray, new_size) = resize!(parent(u), new_size) +function Base.copy(u::TrixiMPIArray) + return TrixiMPIArray(copy(parent(u)), mpi_comm(u)) +end + +Trixi.mpi_comm(u::TrixiMPIArray) = u.mpi_comm + + +# Implementation of the abstract array interface of Base +# See https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-array +Base.size(u::TrixiMPIArray) = size(parent(u)) +Base.getindex(u::TrixiMPIArray, idx) = getindex(parent(u), idx) +Base.setindex!(u::TrixiMPIArray, v, idx) = setindex!(parent(u), v, idx) +Base.IndexStyle(::Type{TrixiMPIArray{T, N, Parent}}) where {T, N, Parent} = IndexStyle(Parent) +function Base.similar(u::TrixiMPIArray, ::Type{S}, dims::NTuple{N, Int}) where {S, N} + return TrixiMPIArray(similar(parent(u), S, dims), mpi_comm(u)) +end +Base.axes(u::TrixiMPIArray) = axes(parent(u)) + + +# Implementation of the strided array interface of Base +# See https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-strided-arrays +Base.strides(u::TrixiMPIArray) = strides(parent(u)) +function Base.unsafe_convert(::Type{Ptr{T}}, u::TrixiMPIArray{T}) where {T} + return Base.unsafe_convert(Ptr{T}, parent(u)) +end +Base.elsize(::Type{TrixiMPIArray{T, N, Parent}}) where {T, N, Parent} = Base.elsize(Parent) + + +# We need to customize broadcasting since broadcasting expressions allocating +# an output would return plain `Array`s otherwise, losing the MPI information. +# Such allocating broadcasting calls are used for example when determining the +# initial step size in OrdinaryDiffEq.jl. +# However, everything else appears to be fine, i.e., all broadcasting calls +# with a given output storage location work fine. In particular, threaded +# broadcasting with FastBroadcast.jl works fine, e.g., when using threaded RK +# methods such as `SSPRK43(thread=OrdinaryDiffEq.True())`. 
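+#
+# For example, an allocating broadcast such as the following (an illustrative
+# sketch, not code from this file) keeps the MPI wrapper intact:
+#
+#   u_mpi = TrixiMPIArray(ones(5))
+#   v = 2 .* u_mpi        # allocates a new output array via `similar`
+#   v isa TrixiMPIArray   # true, thanks to the broadcast style defined below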
+# See also +# https://github.com/YingboMa/FastBroadcast.jl +# https://docs.julialang.org/en/v1/manual/interfaces/#man-interfaces-broadcasting +Base.BroadcastStyle(::Type{<:TrixiMPIArray}) = Broadcast.ArrayStyle{TrixiMPIArray}() + +function Base.similar(bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{TrixiMPIArray}}, ::Type{ElType}) where ElType + # Scan the inputs for the first TrixiMPIArray and use that to create a `similar` + # output array with MPI information + # See also https://docs.julialang.org/en/v1/manual/interfaces/#Selecting-an-appropriate-output-array + A = find_mpi_array(bc) + similar(A, axes(bc)) +end +# `A = find_mpi_array(As)` returns the first TrixiMPIArray among the arguments +find_mpi_array(bc::Base.Broadcast.Broadcasted) = find_mpi_array(bc.args) +find_mpi_array(args::Tuple) = find_mpi_array(find_mpi_array(args[1]), Base.tail(args)) +find_mpi_array(x) = x +find_mpi_array(::Tuple{}) = nothing +find_mpi_array(a::TrixiMPIArray, rest) = a +find_mpi_array(::Any, rest) = find_mpi_array(rest) + + +# Implementation of methods from ArrayInterface.jl for use with +# LoopVectorization.jl etc. +# See https://juliaarrays.github.io/ArrayInterface.jl/stable/ +ArrayInterface.parent_type(::Type{TrixiMPIArray{T, N, Parent}}) where {T, N, Parent} = Parent + + +# TODO: MPI. For now, we do not implement specializations of LinearAlgebra +# functions such as `norm` or `dot`. We might revisit this again +# in the future. + + +# `mapreduce` functionality from Base using global reductions via MPI communication +# for use in, e.g., error-based step size control in OrdinaryDiffEq.jl +# OBS! This makes reductions a global, blocking operation that will stall unless +# executed on all MPI ranks simultaneously +function Base.mapreduce(f::F, op::Op, u::TrixiMPIArray; kwargs...) where {F, Op} + local_value = mapreduce(f, op, parent(u); kwargs...) + return MPI.Allreduce(local_value, op, mpi_comm(u)) +end + + +# Default settings of OrdinaryDiffEq etc. +# Interesting options could be +# - ODE_DEFAULT_UNSTABLE_CHECK +# - ODE_DEFAULT_ISOUTOFDOMAIN (disabled by default) +# - ODE_DEFAULT_NORM +# See https://github.com/SciML/DiffEqBase.jl/blob/master/src/common_defaults.jl +# +# Problems and inconsistencies with possible global `length`s of TrixiMPIArrays +# +# A basic question is how to handle `length`. We want `TrixiMPIArray`s to behave +# like regular `Array`s in most code, e.g., when looping over an array (which +# should use `eachindex`). At the same time, we want to be able to use adaptive +# time stepping using error estimates in OrdinaryDiffEq.jl. There, the default +# norm `ODE_DEFAULT_NORM` is the one described in the book of Hairer & Wanner, +# i.e., it includes a division by the global `length` of the array. We could +# specialize `ODE_DEFAULT_NORM` accordingly, but that requires depending on +# DiffEqBase.jl (instead of SciMLBase.jl). Alternatively, we could implement +# this via Requires.jl, but that will prevent precompilation and maybe trigger +# invalidations. Alternatively, could implement our own norm and pass that as +# `internalnorm`. We could also try to use the least intrusive approach for now +# and specialize `length` to return a global length while making sure that all +# local behavior is still working as expected (if using `eachindex` instead of +# `1:length` etc.). This means that we would have +# `eachindex(u) != Base.OneTo(length(u))` for `u::TrixiMPIArray` in general, +# even if `u` and its underlying array use one-based indexing. 
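+# For instance, with two MPI ranks each holding five local entries, such a
+# global `length` would give `length(u) == 10` while `eachindex(u) == Base.OneTo(5)`,
+# so a loop written as `for i in 1:length(u)` would access out-of-bounds indices
+# locally.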
+# Some consequences are that we need to implement specializations of `show`, +# since the default ones call `length`. However, this doesn't work if not all +# ranks call the same method, e.g., when showing an array only on one rank. +# Moreover, we would need to specialize `copyto!` and probably many other +# functions. Since this can lead to hard-to-find bugs and problems in MPI code, +# we use a more verbose approach. Thus, we let `length` be a local `length` and +# provide a new function `ode_norm` to be passed as `internalnorm` of +# OrdinaryDiffEq's `solve` function. + + +# Specialization of `view`. Without these, `view`s of arrays returned by +# `wrap_array` with multiple conserved variables do not always work... +# This may also be related to the use of a global `length`? +Base.view(u::TrixiMPIArray, idx::Vararg{Any,N}) where {N} = view(parent(u), idx...) + + +""" + ode_norm(u, t) + +Implementation of the weighted L2 norm of Hairer and Wanner used for error-based +step size control in OrdinaryDiffEq.jl. This function is aware of +[`TrixiMPIArray`](@ref)s, handling them appropriately with global MPI +communication. + +You must pass this function as keyword argument +`internalnorm=ode_norm` +of `solve` when using error-based step size control with MPI parallel execution +of Trixi.jl. + +See [`https://diffeq.sciml.ai/stable/basics/common_solver_opts/#advanced_adaptive_stepsize_control`](https://diffeq.sciml.ai/stable/basics/common_solver_opts/#advanced_adaptive_stepsize_control) + +!!! warning "Experimental code" + This code is experimental and may be changed or removed in any future release. +""" +ode_norm(u, t) = @fastmath abs(u) +ode_norm(u::AbstractArray, t) = sqrt(sum(abs2, u) / length(u)) +function ode_norm(u::TrixiMPIArray, t) + local_sumabs2 = sum(abs2, parent(u)) + local_length = length(parent(u)) + global_sumabs2, global_length = MPI.Allreduce([local_sumabs2, local_length], +, mpi_comm(u)) + return sqrt(global_sumabs2 / global_length) +end + + +""" + ode_unstable_check(dt, u, semi, t) + +Implementation of the basic check for instability used in OrdinaryDiffEq.jl. +Instead of checking something like `any(isnan, u)`, this function just checks +`isnan(dt)`. This helps when using [`TrixiMPIArray`](@ref)s, since no additional +global communication is required and all ranks will return the same result. + +You should pass this function as keyword argument +`unstable_check=ode_unstable_check` +of `solve` when using error-based step size control with MPI parallel execution +of Trixi.jl. + +See [`https://diffeq.sciml.ai/stable/basics/common_solver_opts/#Miscellaneous`](https://diffeq.sciml.ai/stable/basics/common_solver_opts/#Miscellaneous) + +!!! warning "Experimental code" + This code is experimental and may be changed or removed in any future release. 
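+
+A minimal usage sketch (with `ode` and `callbacks` set up as in the MPI-parallel
+Trixi.jl elixirs, and a time integrator from OrdinaryDiffEq.jl):
+```julia
+# pass both MPI-aware keyword arguments for error-based step size control
+sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4,
+            save_everystep=false, callback=callbacks,
+            internalnorm=ode_norm, unstable_check=ode_unstable_check)
+```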
+""" +ode_unstable_check(dt, u, semi, t) = isnan(dt) + + +end # module + +@reexport using .TrixiMPIArrays: TrixiMPIArrays, TrixiMPIArray, ode_norm, ode_unstable_check diff --git a/src/callbacks_step/amr.jl b/src/callbacks_step/amr.jl index 6a1a73c838f..17445bb5fe0 100644 --- a/src/callbacks_step/amr.jl +++ b/src/callbacks_step/amr.jl @@ -221,7 +221,7 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::TreeMesh, @unpack to_refine, to_coarsen = amr_callback.amr_cache empty!(to_refine) empty!(to_coarsen) - for element in 1:length(lambda) + for element in eachindex(lambda) controller_value = lambda[element] if controller_value > 0 push!(to_refine, leaf_cell_ids[element]) @@ -282,7 +282,7 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::TreeMesh, end # Extract only those parent cells for which all children should be coarsened - to_coarsen = collect(1:length(parents_to_coarsen))[parents_to_coarsen .== 2^ndims(mesh)] + to_coarsen = collect(eachindex(parents_to_coarsen))[parents_to_coarsen .== 2^ndims(mesh)] # Finally, coarsen mesh coarsened_original_cells = @trixi_timeit timer() "mesh" coarsen!(mesh.tree, to_coarsen) diff --git a/src/callbacks_step/analysis_dg2d_parallel.jl b/src/callbacks_step/analysis_dg2d_parallel.jl index 499bcad6328..a80743d7fe4 100644 --- a/src/callbacks_step/analysis_dg2d_parallel.jl +++ b/src/callbacks_step/analysis_dg2d_parallel.jl @@ -136,17 +136,18 @@ function calc_error_norms(func, u, t, analyzer, end -function integrate_via_indices(func::Func, u, - mesh::ParallelTreeMesh{2}, equations, dg::DGSEM, cache, +# We need to dispatch on `u::TrixiMPIArray` instead of `mesh::TreeMesh{2}` to +# simply use `parent(u)` instead of some `invoke` call. +function integrate_via_indices(func::Func, u::TrixiMPIArray, + mesh::TreeMesh{2}, equations, dg::DGSEM, cache, args...; normalize=true) where {Func} - # call the method accepting a general `mesh::TreeMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - local_integral = invoke(integrate_via_indices, - Tuple{typeof(func), typeof(u), TreeMesh{2}, typeof(equations), - typeof(dg), typeof(cache), map(typeof, args)...}, - func, u, mesh, equations, dg, cache, args..., normalize=normalize) + # Call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards. + # Note that the simple `TreeMesh` implements an efficient way to compute + # the global volume without requiring communication. This global volume is + # already used when `normalize=true`. + local_integral = integrate_via_indices(func, parent(u), mesh, equations, dg, + cache, args...; normalize) # OBS! Global results are only calculated on MPI root, all other domains receive `nothing` global_integral = MPI.Reduce!(Ref(local_integral), +, mpi_root(), mpi_comm()) diff --git a/src/callbacks_step/save_restart_dg.jl b/src/callbacks_step/save_restart_dg.jl index 825ea3bcd5b..6a7e6e28cbd 100644 --- a/src/callbacks_step/save_restart_dg.jl +++ b/src/callbacks_step/save_restart_dg.jl @@ -84,7 +84,9 @@ function load_restart_file(mesh::Union{SerialTreeMesh, StructuredMesh, Unstructu return u_ode end - +# Note that we cannot dispatch on `u::TrixiMPIArray` since we use +# `wrap_array_native` before calling this method, loosing the MPI array +# wrapper type. 
function save_restart_file(u, time, dt, timestep, mesh::Union{ParallelTreeMesh, ParallelP4estMesh}, equations, dg::DG, cache, restart_callback) diff --git a/src/callbacks_step/save_solution_dg.jl b/src/callbacks_step/save_solution_dg.jl index c1708ca6820..43299454714 100644 --- a/src/callbacks_step/save_solution_dg.jl +++ b/src/callbacks_step/save_solution_dg.jl @@ -73,7 +73,9 @@ function save_solution_file(u, time, dt, timestep, return filename end - +# Note that we cannot dispatch on `u::TrixiMPIArray` since we use +# `wrap_array_native` before calling this method, loosing the MPI array +# wrapper type. function save_solution_file(u, time, dt, timestep, mesh::Union{ParallelTreeMesh, ParallelP4estMesh}, equations, dg::DG, cache, solution_callback, element_variables=Dict{Symbol,Any}(); diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index 821e4e744fe..6dd53f09ae1 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -26,7 +26,6 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end - function max_dt(u, t, mesh::TreeMesh{2}, constant_speed::Val{true}, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, @@ -43,32 +42,21 @@ function max_dt(u, t, mesh::TreeMesh{2}, end -function max_dt(u, t, mesh::ParallelTreeMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::TreeMesh{2}, constant_speed::Val{false}, equations, dg::DG, cache) - # call the method accepting a general `mesh::TreeMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt end - -function max_dt(u, t, mesh::ParallelTreeMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::TreeMesh{2}, constant_speed::Val{true}, equations, dg::DG, cache) - # call the method accepting a general `mesh::TreeMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. 
- dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt @@ -107,7 +95,6 @@ function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMe return 2 / (nnodes(dg) * max_scaled_speed) end - function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}}, constant_speed::Val{true}, equations, dg::DG, cache) @unpack contravariant_vectors, inverse_jacobian = cache.elements @@ -135,32 +122,21 @@ function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMe end -function max_dt(u, t, mesh::ParallelP4estMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{2}, constant_speed::Val{false}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt end - -function max_dt(u, t, mesh::ParallelP4estMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{2}, constant_speed::Val{true}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index a7a12372396..c424ebb5ba7 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -109,32 +109,22 @@ function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}}, end -function max_dt(u, t, mesh::ParallelP4estMesh{3}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{3}, constant_speed::Val{false}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{3}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. 
- dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::ParallelP4estMesh{3}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{3}, constant_speed::Val{true}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{3}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt diff --git a/src/callbacks_step/time_series_dg2d.jl b/src/callbacks_step/time_series_dg2d.jl index 778739a824b..a5b097266fd 100644 --- a/src/callbacks_step/time_series_dg2d.jl +++ b/src/callbacks_step/time_series_dg2d.jl @@ -137,7 +137,7 @@ function record_state_at_points!(point_data, u, solution_variables, n_solution_v for j in eachnode(dg), i in eachnode(dg) u_node = solution_variables(get_node_vars(u, equations, dg, i, j, element_id), equations) - for v in 1:length(u_node) + for v in eachindex(u_node) data[old_length + v] += (u_node[v] * interpolating_polynomials[i, 1, index] * interpolating_polynomials[j, 2, index]) diff --git a/src/meshes/meshes.jl b/src/meshes/meshes.jl index a6dcbe132d8..3713d9c8935 100644 --- a/src/meshes/meshes.jl +++ b/src/meshes/meshes.jl @@ -4,6 +4,8 @@ # See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. @muladd begin +# Default value - meshes supporting parallelization via MPI must opt-in +mpi_parallel(mesh::AbstractMesh) = Val{false}() include("tree_mesh.jl") include("structured_mesh.jl") diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 86cbac1ee68..141be1d5fdc 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -369,9 +369,20 @@ include("dgsem/dgsem.jl") function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) - # We must allocate a `Vector` in order to be able to `resize!` it (AMR). + # We must allocate a `Vector` in order to be able to `resize!` it for AMR. + # Currently, Julia does not allow resizing multi-dimensional `Array`s, see + # https://github.com/JuliaLang/julia/issues/37900 # cf. wrap_array - zeros(eltype(cache.elements), nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + u = zeros(eltype(cache.elements), nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + + # This function is used every time we allocate some new coefficient vector. + # Thus, we can check here whether MPI parallelization is set up and wrap + # the coefficient vector for use with MPI if necessary. 
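+  # Meshes opt in to MPI parallelization by returning `Val{true}()` from
+  # `mpi_parallel` (cf. the default `mpi_parallel(mesh::AbstractMesh) = Val{false}()`
+  # in `meshes/meshes.jl`).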
+ if mpi_parallel(mesh) === Val{true}() + return TrixiMPIArray(u) + else + return u + end end @inline function wrap_array(u_ode::AbstractVector, mesh::AbstractMesh, equations, dg::DGSEM, cache) @@ -405,13 +416,19 @@ end # `@batch` from Polyester.jl or something similar. Using Polyester.jl # is probably the best option since everything will be handed over to # Chris Elrod, one of the best performance software engineers for Julia. - PtrArray(pointer(u_ode), - (StaticInt(nvariables(equations)), ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., nelements(dg, cache))) - # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) + u = PtrArray(pointer(u_ode), + (StaticInt(nvariables(equations)), ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., nelements(dg, cache))) + # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) else # The following version is reasonably fast and allows us to `resize!(u_ode, ...)`. - unsafe_wrap(Array{eltype(u_ode), ndims(mesh)+2}, pointer(u_ode), - (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) + u = unsafe_wrap(Array{eltype(u_ode), ndims(mesh)+2}, pointer(u_ode), + (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) + end + + if u_ode isa TrixiMPIArray + return TrixiMPIArray(u, mpi_comm(u_ode)) + else + return u end end diff --git a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl index 4794aad8cb3..a7493b4fe2b 100644 --- a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl @@ -18,13 +18,14 @@ # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements @@ -220,13 +221,14 @@ end -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements diff --git a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl index 1b3180076f9..89a4f5e9390 100644 --- a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl @@ -18,12 +18,13 @@ # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. 
This ensures that `@turbo` # works efficiently here. -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::Union{StructuredMesh{3}, P4estMesh{3}}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements diff --git a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl index faaed4fe68a..1d2970e90cb 100644 --- a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl @@ -66,12 +66,13 @@ end # muladd # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{2}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted @@ -228,12 +229,13 @@ end -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{2}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted diff --git a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl index f78d0cea366..0b4f1fc9d44 100644 --- a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl @@ -17,12 +17,13 @@ # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. 
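+# The `Union` signatures below also accept a `TrixiMPIArray` wrapping a
+# `PtrArray`, since `wrap_array` re-wraps the `PtrArray` in a `TrixiMPIArray`
+# when running with MPI (cf. `wrap_array` in `src/solvers/dg.jl`).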
-@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{3}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted @@ -262,12 +263,13 @@ end -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{3}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_ranocha_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted diff --git a/test/test_mpi.jl b/test/test_mpi.jl index 4a210f1ee6b..77d0c656c4b 100644 --- a/test/test_mpi.jl +++ b/test/test_mpi.jl @@ -16,6 +16,9 @@ Trixi.mpi_isroot() && isdir(outdir) && rm(outdir, recursive=true) CI_ON_WINDOWS = (get(ENV, "GITHUB_ACTIONS", false) == "true") && Sys.iswindows() @testset "MPI" begin + # TrixiMPIArray tests + include("test_mpi_arrays.jl") + # TreeMesh tests include("test_mpi_tree.jl") diff --git a/test/test_mpi_arrays.jl b/test/test_mpi_arrays.jl new file mode 100644 index 00000000000..b40e5f2af68 --- /dev/null +++ b/test/test_mpi_arrays.jl @@ -0,0 +1,69 @@ +module TestTrixiMPIArrays + +using Test +using LinearAlgebra: norm +using Trixi: TrixiMPIArray, mpi_isroot, mpi_nranks, ode_norm, ode_unstable_check + + +@testset "TestTrixiMPIArrays" begin + +@testset "show" begin + u_parent = zeros(5) + u_mpi = TrixiMPIArray(u_parent) + + # output on all ranks + @test_nowarn show(stdout, u_mpi) + @test_nowarn show(stdout, MIME"text/plain"(), u_mpi) + + # test whether show works also on a single rank + if mpi_isroot() + @test_nowarn show(stdout, u_mpi) + @test_nowarn show(stdout, MIME"text/plain"(), u_mpi) + end +end + + +@testset "vector interface" begin + u_parent = ones(5) + u_mpi = TrixiMPIArray(u_parent) + + @test sum(u_mpi) ≈ 5 * mpi_nranks() + + @test_nowarn resize!(u_mpi, 4) + @test sum(u_mpi) ≈ 4 * mpi_nranks() + @test sum(u_parent) == 4 + + u_mpi2 = copy(u_mpi) + @test sum(u_mpi2) ≈ sum(u_mpi) + + res = @. u_mpi / u_mpi + @test res isa TrixiMPIArray + + res = @. u_mpi / u_parent + @test res isa TrixiMPIArray + + res = @. 
5 * u_mpi + @test res isa TrixiMPIArray + + @test strides(u_mpi) == strides(u_parent) + @test Base.elsize(u_mpi) == sizeof(Float64) +end + + +@testset "ODE interface" begin + u_parent = [1.0, -2.0, 3.5, -4.0] + u_mpi = TrixiMPIArray(u_parent) + + # duplicating a vector doesn't change the norm weighted by the global length + @test ode_norm(u_parent, 0.0) ≈ norm(u_parent) / sqrt(length(u_parent)) + @test ode_norm(u_mpi, 0.0) ≈ ode_norm(u_parent, 0.0) + + u_parent[1] = NaN + @test ode_unstable_check(rand(), u_mpi, nothing, 0.0) == false + @test ode_unstable_check(NaN, u_mpi, nothing, 0.0) == true +end + + +end # TestTrixiMPIArrays + +end # module diff --git a/test/test_mpi_p4est_2d.jl b/test/test_mpi_p4est_2d.jl index 325767da068..9e6f1ebbeea 100644 --- a/test/test_mpi_p4est_2d.jl +++ b/test/test_mpi_p4est_2d.jl @@ -18,6 +18,21 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " # Expected errors are exactly the same as with TreeMesh! l2 = [8.311947673061856e-6], linf = [6.627000273229378e-5]) + + @testset "error-based step size control" begin + Trixi.mpi_isroot() && println("-"^100) + Trixi.mpi_isroot() && println("elixir_advection_basic.jl with error-based step size control") + + sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4, + save_everystep=false, callback=callbacks, + internalnorm=ode_norm, + unstable_check=ode_unstable_check); summary_callback() + errors = analysis_callback(sol) + if Trixi.mpi_isroot() + @test errors.l2 ≈ [3.3022040342579066e-5] rtol=1.0e-4 + @test errors.linf ≈ [0.00011787417954578494] rtol=1.0e-4 + end + end end @trixi_testset "elixir_advection_nonconforming_flag.jl" begin diff --git a/test/test_mpi_p4est_3d.jl b/test/test_mpi_p4est_3d.jl index 3488e1bfbd0..f7eb1574ccf 100644 --- a/test/test_mpi_p4est_3d.jl +++ b/test/test_mpi_p4est_3d.jl @@ -18,6 +18,21 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " # Expected errors are exactly the same as with TreeMesh! 
l2 = [0.00016263963870641478], linf = [0.0014537194925779984]) + + @testset "error-based step size control" begin + Trixi.mpi_isroot() && println("-"^100) + Trixi.mpi_isroot() && println("elixir_advection_basic.jl with error-based step size control") + + sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4, + save_everystep=false, callback=callbacks, + internalnorm=ode_norm, + unstable_check=ode_unstable_check); summary_callback() + errors = analysis_callback(sol) + if Trixi.mpi_isroot() + @test errors.l2 ≈ [0.00016800412839949264] rtol=1.0e-4 + @test errors.linf ≈ [0.0014548839020096516] rtol=1.0e-4 + end + end end @trixi_testset "elixir_advection_amr.jl" begin diff --git a/test/test_mpi_tree.jl b/test/test_mpi_tree.jl index d8f98f17c53..79ecd39287b 100644 --- a/test/test_mpi_tree.jl +++ b/test/test_mpi_tree.jl @@ -124,6 +124,21 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_ec.jl"), l2 = [0.061751715597716854, 0.05018223615408711, 0.05018989446443463, 0.225871559730513], linf = [0.29347582879608825, 0.31081249232844693, 0.3107380389947736, 1.0540358049885143]) + + @testset "error-based step size control" begin + Trixi.mpi_isroot() && println("-"^100) + Trixi.mpi_isroot() && println("elixir_euler_ec.jl with error-based step size control") + + sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4, + save_everystep=false, callback=callbacks, + internalnorm=ode_norm, + unstable_check=ode_unstable_check); summary_callback() + errors = analysis_callback(sol) + if Trixi.mpi_isroot() + @test errors.l2 ≈ [0.061653630426688116, 0.05006930431098764, 0.05007694316484242, 0.22550689872331683] rtol=1.0e-4 + @test errors.linf ≈ [0.28516937484583693, 0.2983633696512788, 0.297812036335975, 1.027368795517512] rtol=1.0e-4 + end + end end @trixi_testset "elixir_euler_vortex.jl" begin