diff --git a/Project.toml b/Project.toml index 246af4bec54..5715725fd0d 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Michael Schlottke-Lakemper ", "Gregor version = "0.4.29-pre" [deps] +ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" @@ -40,6 +41,7 @@ TriplotRecipes = "808ab39a-a642-4abf-81ff-4cb34ebbffa3" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [compat] +ArrayInterface = "3" CodeTracking = "1.0.5" ConstructionBase = "1.3" EllipsisNotation = "1.0" diff --git a/docs/src/reference-trixi.md b/docs/src/reference-trixi.md index 940051115d5..73ef8fe27e8 100644 --- a/docs/src/reference-trixi.md +++ b/docs/src/reference-trixi.md @@ -5,5 +5,5 @@ CurrentModule = Trixi ``` ```@autodocs -Modules = [Trixi] +Modules = [Trixi, Trixi.TrixiMPIArrays] ``` diff --git a/src/Trixi.jl b/src/Trixi.jl index 04a8ba707df..7f497805b43 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -25,6 +25,7 @@ using SparseArrays: AbstractSparseMatrix, AbstractSparseMatrixCSC, sparse, dropt # import @reexport now to make it available for further imports/exports using Reexport: @reexport +using ArrayInterface: static_length using SciMLBase: CallbackSet, DiscreteCallback, ODEProblem, ODESolution, ODEFunction import SciMLBase: get_du, get_tmp_cache, u_modified!, @@ -39,7 +40,6 @@ using HDF5: h5open, attributes using IfElse: ifelse using LinearMaps: LinearMap using LoopVectorization: LoopVectorization, @turbo, indices -using LoopVectorization.ArrayInterface: static_length using MPI: MPI using MuladdMacro: @muladd using GeometryBasics: GeometryBasics @@ -99,6 +99,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") include("auxiliary/mpi.jl") +include("auxiliary/mpi_arrays.jl") include("auxiliary/p4est.jl") include("equations/equations.jl") include("meshes/meshes.jl") diff --git a/src/auxiliary/mpi_arrays.jl b/src/auxiliary/mpi_arrays.jl new file mode 100644 index 00000000000..97b9a0d99ba --- /dev/null +++ b/src/auxiliary/mpi_arrays.jl @@ -0,0 +1,241 @@ + +# TODO: MPI. We keep this module inside Trixi for now. When it stabilizes and +# turns out to be generally useful, we can consider moving it to a +# separate package with simple test suite and documentation. +module TrixiMPIArrays + +using ArrayInterface: ArrayInterface +using MPI: MPI + +using ..Trixi: Trixi, mpi_comm + +export TrixiMPIArray, ode_norm, ode_unstable_check + + +# Dispatch etc. +# The following functions have special dispatch behavior for `TrixiMPIArray`s. +# - `Trixi.wrap_array`: +# the wrapped array is wrapped again in a `TrixiMPIArray` +# - `Trixi.wrap_array_native`: +# should not be changed since it should return a plain `Array` +# - `Trixi.allocate_coefficients`: +# this handles the return type of initialization stuff when setting an IC +# with MPI +# Besides these, we usually dispatch on MPI mesh types such as +# `mesh::ParallelTreeMesh` or ``mesh::ParallelP4eestMesh` in Trixi.jl, since +# this is consistent with other dispatches on the mesh type. However, we +# dispatch on `u::TrixiMPIArray` whenever this allows simplifying some code, +# e.g., because we can call a basic function on `parent(u)` and add some MPI +# stuff on top. +""" + TrixiMPIArray{T, N} <: AbstractArray{T, N} + TrixiMPIArray(u::AbstractArray{T, N})::TrixiMPIArray{T, N} + +A thin wrapper of arrays distributed via MPI used in Trixi.jl. 
The idea is that +these arrays behave as much as possible as plain arrays would in an SPMD-style +distributed MPI setting with exception of reductions, which are performed +globally. This allows to use these arrays in ODE solvers such as the ones from +OrdinaryDiffEq.jl, since vector space operations, broadcasting, and reductions +are the only operations required for explicit time integration methods with +fixed step sizes or adaptive step sizes based on CFL or error estimates. + +!!! warning "Experimental code" + This code is experimental and may be changed or removed in any future release. +""" +struct TrixiMPIArray{T, N, Parent<:AbstractArray{T, N}} <: AbstractArray{T, N} + u_local::Parent + mpi_comm::MPI.Comm +end + +function TrixiMPIArray(u_local::AbstractArray{T, N}) where {T, N} + # TODO: MPI. Hard-coded to MPI.COMM_WORLD for now + mpi_comm = MPI.COMM_WORLD + TrixiMPIArray{T, N, typeof(u_local)}(u_local, mpi_comm) +end + + +# `Base.show` with additional helpful information +function Base.show(io::IO, u::TrixiMPIArray) + print(io, "TrixiMPIArray wrapping ", parent(u)) +end + +function Base.show(io::IO, mime::MIME"text/plain", u::TrixiMPIArray) + print(io, "TrixiMPIArray wrapping ") + show(io, mime, parent(u)) +end + + +# Custom interface and general Base interface not covered by other parts below +Base.parent(u::TrixiMPIArray) = u.u_local +Base.resize!(u::TrixiMPIArray, new_size) = resize!(parent(u), new_size) +function Base.copy(u::TrixiMPIArray) + return TrixiMPIArray(copy(parent(u)), mpi_comm(u)) +end + +Trixi.mpi_comm(u::TrixiMPIArray) = u.mpi_comm + + +# Implementation of the abstract array interface of Base +# See https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-array +Base.size(u::TrixiMPIArray) = size(parent(u)) +Base.getindex(u::TrixiMPIArray, idx) = getindex(parent(u), idx) +Base.setindex!(u::TrixiMPIArray, v, idx) = setindex!(parent(u), v, idx) +Base.IndexStyle(::Type{TrixiMPIArray{T, N, Parent}}) where {T, N, Parent} = IndexStyle(Parent) +function Base.similar(u::TrixiMPIArray, ::Type{S}, dims::NTuple{N, Int}) where {S, N} + return TrixiMPIArray(similar(parent(u), S, dims), mpi_comm(u)) +end +Base.axes(u::TrixiMPIArray) = axes(parent(u)) + + +# Implementation of the strided array interface of Base +# See https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-strided-arrays +Base.strides(u::TrixiMPIArray) = strides(parent(u)) +function Base.unsafe_convert(::Type{Ptr{T}}, u::TrixiMPIArray{T}) where {T} + return Base.unsafe_convert(Ptr{T}, parent(u)) +end +Base.elsize(::Type{TrixiMPIArray{T, N, Parent}}) where {T, N, Parent} = Base.elsize(Parent) + + +# We need to customize broadcasting since broadcasting expressions allocating +# an output would return plain `Array`s otherwise, losing the MPI information. +# Such allocating broadcasting calls are used for example when determining the +# initial step size in OrdinaryDiffEq.jl. +# However, everything else appears to be fine, i.e., all broadcasting calls +# with a given output storage location work fine. In particular, threaded +# broadcasting with FastBroadcast.jl works fine, e.g., when using threaded RK +# methods such as `SSPRK43(thread=OrdinaryDiffEq.True())`. 
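+#
+# For example, an allocating broadcast such as the following (an illustrative
+# sketch, not code from this file) keeps the MPI wrapper intact:
+#
+#   u_mpi = TrixiMPIArray(ones(5))
+#   v = 2 .* u_mpi        # allocates a new output array via `similar`
+#   v isa TrixiMPIArray   # true, thanks to the broadcast style defined below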
+# See also +# https://github.com/YingboMa/FastBroadcast.jl +# https://docs.julialang.org/en/v1/manual/interfaces/#man-interfaces-broadcasting +Base.BroadcastStyle(::Type{<:TrixiMPIArray}) = Broadcast.ArrayStyle{TrixiMPIArray}() + +function Base.similar(bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{TrixiMPIArray}}, ::Type{ElType}) where ElType + # Scan the inputs for the first TrixiMPIArray and use that to create a `similar` + # output array with MPI information + # See also https://docs.julialang.org/en/v1/manual/interfaces/#Selecting-an-appropriate-output-array + A = find_mpi_array(bc) + similar(A, axes(bc)) +end +# `A = find_mpi_array(As)` returns the first TrixiMPIArray among the arguments +find_mpi_array(bc::Base.Broadcast.Broadcasted) = find_mpi_array(bc.args) +find_mpi_array(args::Tuple) = find_mpi_array(find_mpi_array(args[1]), Base.tail(args)) +find_mpi_array(x) = x +find_mpi_array(::Tuple{}) = nothing +find_mpi_array(a::TrixiMPIArray, rest) = a +find_mpi_array(::Any, rest) = find_mpi_array(rest) + + +# Implementation of methods from ArrayInterface.jl for use with +# LoopVectorization.jl etc. +# See https://juliaarrays.github.io/ArrayInterface.jl/stable/ +ArrayInterface.parent_type(::Type{TrixiMPIArray{T, N, Parent}}) where {T, N, Parent} = Parent + + +# TODO: MPI. For now, we do not implement specializations of LinearAlgebra +# functions such as `norm` or `dot`. We might revisit this again +# in the future. + + +# `mapreduce` functionality from Base using global reductions via MPI communication +# for use in, e.g., error-based step size control in OrdinaryDiffEq.jl +# OBS! This makes reductions a global, blocking operation that will stall unless +# executed on all MPI ranks simultaneously +function Base.mapreduce(f::F, op::Op, u::TrixiMPIArray; kwargs...) where {F, Op} + local_value = mapreduce(f, op, parent(u); kwargs...) + return MPI.Allreduce(local_value, op, mpi_comm(u)) +end + + +# Default settings of OrdinaryDiffEq etc. +# Interesting options could be +# - ODE_DEFAULT_UNSTABLE_CHECK +# - ODE_DEFAULT_ISOUTOFDOMAIN (disabled by default) +# - ODE_DEFAULT_NORM +# See https://github.com/SciML/DiffEqBase.jl/blob/master/src/common_defaults.jl +# +# Problems and inconsistencies with possible global `length`s of TrixiMPIArrays +# +# A basic question is how to handle `length`. We want `TrixiMPIArray`s to behave +# like regular `Array`s in most code, e.g., when looping over an array (which +# should use `eachindex`). At the same time, we want to be able to use adaptive +# time stepping using error estimates in OrdinaryDiffEq.jl. There, the default +# norm `ODE_DEFAULT_NORM` is the one described in the book of Hairer & Wanner, +# i.e., it includes a division by the global `length` of the array. We could +# specialize `ODE_DEFAULT_NORM` accordingly, but that requires depending on +# DiffEqBase.jl (instead of SciMLBase.jl). Alternatively, we could implement +# this via Requires.jl, but that will prevent precompilation and maybe trigger +# invalidations. Alternatively, could implement our own norm and pass that as +# `internalnorm`. We could also try to use the least intrusive approach for now +# and specialize `length` to return a global length while making sure that all +# local behavior is still working as expected (if using `eachindex` instead of +# `1:length` etc.). This means that we would have +# `eachindex(u) != Base.OneTo(length(u))` for `u::TrixiMPIArray` in general, +# even if `u` and its underlying array use one-based indexing. 
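+# For instance, with two MPI ranks each holding five local entries, such a
+# global `length` would give `length(u) == 10` while `eachindex(u) == Base.OneTo(5)`,
+# so a loop written as `for i in 1:length(u)` would access out-of-bounds indices
+# locally.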
+# Some consequences are that we need to implement specializations of `show`, +# since the default ones call `length`. However, this doesn't work if not all +# ranks call the same method, e.g., when showing an array only on one rank. +# Moreover, we would need to specialize `copyto!` and probably many other +# functions. Since this can lead to hard-to-find bugs and problems in MPI code, +# we use a more verbose approach. Thus, we let `length` be a local `length` and +# provide a new function `ode_norm` to be passed as `internalnorm` of +# OrdinaryDiffEq's `solve` function. + + +# Specialization of `view`. Without these, `view`s of arrays returned by +# `wrap_array` with multiple conserved variables do not always work... +# This may also be related to the use of a global `length`? +Base.view(u::TrixiMPIArray, idx::Vararg{Any,N}) where {N} = view(parent(u), idx...) + + +""" + ode_norm(u, t) + +Implementation of the weighted L2 norm of Hairer and Wanner used for error-based +step size control in OrdinaryDiffEq.jl. This function is aware of +[`TrixiMPIArray`](@ref)s, handling them appropriately with global MPI +communication. + +You must pass this function as keyword argument +`internalnorm=ode_norm` +of `solve` when using error-based step size control with MPI parallel execution +of Trixi.jl. + +See [`https://diffeq.sciml.ai/stable/basics/common_solver_opts/#advanced_adaptive_stepsize_control`](https://diffeq.sciml.ai/stable/basics/common_solver_opts/#advanced_adaptive_stepsize_control) + +!!! warning "Experimental code" + This code is experimental and may be changed or removed in any future release. +""" +ode_norm(u, t) = @fastmath abs(u) +ode_norm(u::AbstractArray, t) = sqrt(sum(abs2, u) / length(u)) +function ode_norm(u::TrixiMPIArray, t) + local_sumabs2 = sum(abs2, parent(u)) + local_length = length(parent(u)) + global_sumabs2, global_length = MPI.Allreduce([local_sumabs2, local_length], +, mpi_comm(u)) + return sqrt(global_sumabs2 / global_length) +end + + +""" + ode_unstable_check(dt, u, semi, t) + +Implementation of the basic check for instability used in OrdinaryDiffEq.jl. +Instead of checking something like `any(isnan, u)`, this function just checks +`isnan(dt)`. This helps when using [`TrixiMPIArray`](@ref)s, since no additional +global communication is required and all ranks will return the same result. + +You should pass this function as keyword argument +`unstable_check=ode_unstable_check` +of `solve` when using error-based step size control with MPI parallel execution +of Trixi.jl. + +See [`https://diffeq.sciml.ai/stable/basics/common_solver_opts/#Miscellaneous`](https://diffeq.sciml.ai/stable/basics/common_solver_opts/#Miscellaneous) + +!!! warning "Experimental code" + This code is experimental and may be changed or removed in any future release. 
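+
+A minimal usage sketch (with `ode` and `callbacks` set up as in the MPI-parallel
+Trixi.jl elixirs, and a time integrator from OrdinaryDiffEq.jl):
+```julia
+# pass both MPI-aware keyword arguments for error-based step size control
+sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4,
+            save_everystep=false, callback=callbacks,
+            internalnorm=ode_norm, unstable_check=ode_unstable_check)
+```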
+""" +ode_unstable_check(dt, u, semi, t) = isnan(dt) + + +end # module + +@reexport using .TrixiMPIArrays: TrixiMPIArrays, TrixiMPIArray, ode_norm, ode_unstable_check diff --git a/src/callbacks_step/amr.jl b/src/callbacks_step/amr.jl index 6a1a73c838f..17445bb5fe0 100644 --- a/src/callbacks_step/amr.jl +++ b/src/callbacks_step/amr.jl @@ -221,7 +221,7 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::TreeMesh, @unpack to_refine, to_coarsen = amr_callback.amr_cache empty!(to_refine) empty!(to_coarsen) - for element in 1:length(lambda) + for element in eachindex(lambda) controller_value = lambda[element] if controller_value > 0 push!(to_refine, leaf_cell_ids[element]) @@ -282,7 +282,7 @@ function (amr_callback::AMRCallback)(u_ode::AbstractVector, mesh::TreeMesh, end # Extract only those parent cells for which all children should be coarsened - to_coarsen = collect(1:length(parents_to_coarsen))[parents_to_coarsen .== 2^ndims(mesh)] + to_coarsen = collect(eachindex(parents_to_coarsen))[parents_to_coarsen .== 2^ndims(mesh)] # Finally, coarsen mesh coarsened_original_cells = @trixi_timeit timer() "mesh" coarsen!(mesh.tree, to_coarsen) diff --git a/src/callbacks_step/analysis_dg2d_parallel.jl b/src/callbacks_step/analysis_dg2d_parallel.jl index 499bcad6328..a80743d7fe4 100644 --- a/src/callbacks_step/analysis_dg2d_parallel.jl +++ b/src/callbacks_step/analysis_dg2d_parallel.jl @@ -136,17 +136,18 @@ function calc_error_norms(func, u, t, analyzer, end -function integrate_via_indices(func::Func, u, - mesh::ParallelTreeMesh{2}, equations, dg::DGSEM, cache, +# We need to dispatch on `u::TrixiMPIArray` instead of `mesh::TreeMesh{2}` to +# simply use `parent(u)` instead of some `invoke` call. +function integrate_via_indices(func::Func, u::TrixiMPIArray, + mesh::TreeMesh{2}, equations, dg::DGSEM, cache, args...; normalize=true) where {Func} - # call the method accepting a general `mesh::TreeMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - local_integral = invoke(integrate_via_indices, - Tuple{typeof(func), typeof(u), TreeMesh{2}, typeof(equations), - typeof(dg), typeof(cache), map(typeof, args)...}, - func, u, mesh, equations, dg, cache, args..., normalize=normalize) + # Call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards. + # Note that the simple `TreeMesh` implements an efficient way to compute + # the global volume without requiring communication. This global volume is + # already used when `normalize=true`. + local_integral = integrate_via_indices(func, parent(u), mesh, equations, dg, + cache, args...; normalize) # OBS! Global results are only calculated on MPI root, all other domains receive `nothing` global_integral = MPI.Reduce!(Ref(local_integral), +, mpi_root(), mpi_comm()) diff --git a/src/callbacks_step/save_restart_dg.jl b/src/callbacks_step/save_restart_dg.jl index 825ea3bcd5b..6a7e6e28cbd 100644 --- a/src/callbacks_step/save_restart_dg.jl +++ b/src/callbacks_step/save_restart_dg.jl @@ -84,7 +84,9 @@ function load_restart_file(mesh::Union{SerialTreeMesh, StructuredMesh, Unstructu return u_ode end - +# Note that we cannot dispatch on `u::TrixiMPIArray` since we use +# `wrap_array_native` before calling this method, loosing the MPI array +# wrapper type. 
function save_restart_file(u, time, dt, timestep, mesh::Union{ParallelTreeMesh, ParallelP4estMesh}, equations, dg::DG, cache, restart_callback) diff --git a/src/callbacks_step/save_solution_dg.jl b/src/callbacks_step/save_solution_dg.jl index c1708ca6820..43299454714 100644 --- a/src/callbacks_step/save_solution_dg.jl +++ b/src/callbacks_step/save_solution_dg.jl @@ -73,7 +73,9 @@ function save_solution_file(u, time, dt, timestep, return filename end - +# Note that we cannot dispatch on `u::TrixiMPIArray` since we use +# `wrap_array_native` before calling this method, loosing the MPI array +# wrapper type. function save_solution_file(u, time, dt, timestep, mesh::Union{ParallelTreeMesh, ParallelP4estMesh}, equations, dg::DG, cache, solution_callback, element_variables=Dict{Symbol,Any}(); diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index 821e4e744fe..6dd53f09ae1 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -26,7 +26,6 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end - function max_dt(u, t, mesh::TreeMesh{2}, constant_speed::Val{true}, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, @@ -43,32 +42,21 @@ function max_dt(u, t, mesh::TreeMesh{2}, end -function max_dt(u, t, mesh::ParallelTreeMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::TreeMesh{2}, constant_speed::Val{false}, equations, dg::DG, cache) - # call the method accepting a general `mesh::TreeMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt end - -function max_dt(u, t, mesh::ParallelTreeMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::TreeMesh{2}, constant_speed::Val{true}, equations, dg::DG, cache) - # call the method accepting a general `mesh::TreeMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. 
- dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt @@ -107,7 +95,6 @@ function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMe return 2 / (nnodes(dg) * max_scaled_speed) end - function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}}, constant_speed::Val{true}, equations, dg::DG, cache) @unpack contravariant_vectors, inverse_jacobian = cache.elements @@ -135,32 +122,21 @@ function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMe end -function max_dt(u, t, mesh::ParallelP4estMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{2}, constant_speed::Val{false}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt end - -function max_dt(u, t, mesh::ParallelP4estMesh{2}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{2}, constant_speed::Val{true}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{2}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index a7a12372396..c424ebb5ba7 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -109,32 +109,22 @@ function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}}, end -function max_dt(u, t, mesh::ParallelP4estMesh{3}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{3}, constant_speed::Val{false}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{3}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. 
- dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::ParallelP4estMesh{3}, +function max_dt(u::TrixiMPIArray, t, mesh::P4estMesh{3}, constant_speed::Val{true}, equations, dg::DG, cache) - # call the method accepting a general `mesh::P4estMesh{3}` - # TODO: MPI, we should improve this; maybe we should dispatch on `u` - # and create some MPI array type, overloading broadcasting and mapreduce etc. - # Then, this specific array type should also work well with DiffEq etc. - dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, - typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + # call the method for the local degrees of freedom and perform a global + # MPI reduction afterwards + dt = max_dt(parent(u), t, mesh, constant_speed, equations, dg, cache) dt = MPI.Allreduce!(Ref(dt), min, mpi_comm())[] return dt diff --git a/src/callbacks_step/time_series_dg2d.jl b/src/callbacks_step/time_series_dg2d.jl index 778739a824b..a5b097266fd 100644 --- a/src/callbacks_step/time_series_dg2d.jl +++ b/src/callbacks_step/time_series_dg2d.jl @@ -137,7 +137,7 @@ function record_state_at_points!(point_data, u, solution_variables, n_solution_v for j in eachnode(dg), i in eachnode(dg) u_node = solution_variables(get_node_vars(u, equations, dg, i, j, element_id), equations) - for v in 1:length(u_node) + for v in eachindex(u_node) data[old_length + v] += (u_node[v] * interpolating_polynomials[i, 1, index] * interpolating_polynomials[j, 2, index]) diff --git a/src/meshes/meshes.jl b/src/meshes/meshes.jl index a6dcbe132d8..3713d9c8935 100644 --- a/src/meshes/meshes.jl +++ b/src/meshes/meshes.jl @@ -4,6 +4,8 @@ # See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. @muladd begin +# Default value - meshes supporting parallelization via MPI must opt-in +mpi_parallel(mesh::AbstractMesh) = Val{false}() include("tree_mesh.jl") include("structured_mesh.jl") diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 86cbac1ee68..141be1d5fdc 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -369,9 +369,20 @@ include("dgsem/dgsem.jl") function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) - # We must allocate a `Vector` in order to be able to `resize!` it (AMR). + # We must allocate a `Vector` in order to be able to `resize!` it for AMR. + # Currently, Julia does not allow resizing multi-dimensional `Array`s, see + # https://github.com/JuliaLang/julia/issues/37900 # cf. wrap_array - zeros(eltype(cache.elements), nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + u = zeros(eltype(cache.elements), nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + + # This function is used every time we allocate some new coefficient vector. + # Thus, we can check here whether MPI parallelization is set up and wrap + # the coefficient vector for use with MPI if necessary. 
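+  # Meshes opt in to MPI parallelization by returning `Val{true}()` from
+  # `mpi_parallel` (cf. the default `mpi_parallel(mesh::AbstractMesh) = Val{false}()`
+  # in `meshes/meshes.jl`).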
+ if mpi_parallel(mesh) === Val{true}() + return TrixiMPIArray(u) + else + return u + end end @inline function wrap_array(u_ode::AbstractVector, mesh::AbstractMesh, equations, dg::DGSEM, cache) @@ -405,13 +416,19 @@ end # `@batch` from Polyester.jl or something similar. Using Polyester.jl # is probably the best option since everything will be handed over to # Chris Elrod, one of the best performance software engineers for Julia. - PtrArray(pointer(u_ode), - (StaticInt(nvariables(equations)), ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., nelements(dg, cache))) - # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) + u = PtrArray(pointer(u_ode), + (StaticInt(nvariables(equations)), ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., nelements(dg, cache))) + # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) else # The following version is reasonably fast and allows us to `resize!(u_ode, ...)`. - unsafe_wrap(Array{eltype(u_ode), ndims(mesh)+2}, pointer(u_ode), - (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) + u = unsafe_wrap(Array{eltype(u_ode), ndims(mesh)+2}, pointer(u_ode), + (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) + end + + if u_ode isa TrixiMPIArray + return TrixiMPIArray(u, mpi_comm(u_ode)) + else + return u end end diff --git a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl index 4794aad8cb3..a7493b4fe2b 100644 --- a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl @@ -18,13 +18,14 @@ # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements @@ -220,13 +221,14 @@ end -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements diff --git a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl index 1b3180076f9..89a4f5e9390 100644 --- a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl @@ -18,12 +18,13 @@ # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. 
This ensures that `@turbo` # works efficiently here. -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::Union{StructuredMesh{3}, P4estMesh{3}}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements diff --git a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl index faaed4fe68a..1d2970e90cb 100644 --- a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl @@ -66,12 +66,13 @@ end # muladd # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{2}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted @@ -228,12 +229,13 @@ end -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{2}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted diff --git a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl index f78d0cea366..0b4f1fc9d44 100644 --- a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl @@ -17,12 +17,13 @@ # We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array` # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. 
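+# The `Union` signatures below also accept a `TrixiMPIArray` wrapping a
+# `PtrArray`, since `wrap_array` re-wraps the `PtrArray` in a `TrixiMPIArray`
+# when running with MPI (cf. `wrap_array` in `src/solvers/dg.jl`).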
-@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{3}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted @@ -262,12 +263,13 @@ end -@inline function split_form_kernel!(_du::PtrArray, u_cons::PtrArray, +@inline function split_form_kernel!(_du::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, + u_cons::Union{PtrArray, TrixiMPIArray{T, N, <:PtrArray}}, element, mesh::TreeMesh{3}, nonconservative_terms::Val{false}, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_ranocha_turbo), - dg::DGSEM, cache, alpha) + dg::DGSEM, cache, alpha) where {T, N} @unpack derivative_split = dg.basis # Create a temporary array that will be used to store the RHS with permuted diff --git a/test/test_mpi.jl b/test/test_mpi.jl index 4a210f1ee6b..77d0c656c4b 100644 --- a/test/test_mpi.jl +++ b/test/test_mpi.jl @@ -16,6 +16,9 @@ Trixi.mpi_isroot() && isdir(outdir) && rm(outdir, recursive=true) CI_ON_WINDOWS = (get(ENV, "GITHUB_ACTIONS", false) == "true") && Sys.iswindows() @testset "MPI" begin + # TrixiMPIArray tests + include("test_mpi_arrays.jl") + # TreeMesh tests include("test_mpi_tree.jl") diff --git a/test/test_mpi_arrays.jl b/test/test_mpi_arrays.jl new file mode 100644 index 00000000000..b40e5f2af68 --- /dev/null +++ b/test/test_mpi_arrays.jl @@ -0,0 +1,69 @@ +module TestTrixiMPIArrays + +using Test +using LinearAlgebra: norm +using Trixi: TrixiMPIArray, mpi_isroot, mpi_nranks, ode_norm, ode_unstable_check + + +@testset "TestTrixiMPIArrays" begin + +@testset "show" begin + u_parent = zeros(5) + u_mpi = TrixiMPIArray(u_parent) + + # output on all ranks + @test_nowarn show(stdout, u_mpi) + @test_nowarn show(stdout, MIME"text/plain"(), u_mpi) + + # test whether show works also on a single rank + if mpi_isroot() + @test_nowarn show(stdout, u_mpi) + @test_nowarn show(stdout, MIME"text/plain"(), u_mpi) + end +end + + +@testset "vector interface" begin + u_parent = ones(5) + u_mpi = TrixiMPIArray(u_parent) + + @test sum(u_mpi) ≈ 5 * mpi_nranks() + + @test_nowarn resize!(u_mpi, 4) + @test sum(u_mpi) ≈ 4 * mpi_nranks() + @test sum(u_parent) == 4 + + u_mpi2 = copy(u_mpi) + @test sum(u_mpi2) ≈ sum(u_mpi) + + res = @. u_mpi / u_mpi + @test res isa TrixiMPIArray + + res = @. u_mpi / u_parent + @test res isa TrixiMPIArray + + res = @. 
5 * u_mpi + @test res isa TrixiMPIArray + + @test strides(u_mpi) == strides(u_parent) + @test Base.elsize(u_mpi) == sizeof(Float64) +end + + +@testset "ODE interface" begin + u_parent = [1.0, -2.0, 3.5, -4.0] + u_mpi = TrixiMPIArray(u_parent) + + # duplicating a vector doesn't change the norm weighted by the global length + @test ode_norm(u_parent, 0.0) ≈ norm(u_parent) / sqrt(length(u_parent)) + @test ode_norm(u_mpi, 0.0) ≈ ode_norm(u_parent, 0.0) + + u_parent[1] = NaN + @test ode_unstable_check(rand(), u_mpi, nothing, 0.0) == false + @test ode_unstable_check(NaN, u_mpi, nothing, 0.0) == true +end + + +end # TestTrixiMPIArrays + +end # module diff --git a/test/test_mpi_p4est_2d.jl b/test/test_mpi_p4est_2d.jl index 325767da068..9e6f1ebbeea 100644 --- a/test/test_mpi_p4est_2d.jl +++ b/test/test_mpi_p4est_2d.jl @@ -18,6 +18,21 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " # Expected errors are exactly the same as with TreeMesh! l2 = [8.311947673061856e-6], linf = [6.627000273229378e-5]) + + @testset "error-based step size control" begin + Trixi.mpi_isroot() && println("-"^100) + Trixi.mpi_isroot() && println("elixir_advection_basic.jl with error-based step size control") + + sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4, + save_everystep=false, callback=callbacks, + internalnorm=ode_norm, + unstable_check=ode_unstable_check); summary_callback() + errors = analysis_callback(sol) + if Trixi.mpi_isroot() + @test errors.l2 ≈ [3.3022040342579066e-5] rtol=1.0e-4 + @test errors.linf ≈ [0.00011787417954578494] rtol=1.0e-4 + end + end end @trixi_testset "elixir_advection_nonconforming_flag.jl" begin diff --git a/test/test_mpi_p4est_3d.jl b/test/test_mpi_p4est_3d.jl index 3488e1bfbd0..f7eb1574ccf 100644 --- a/test/test_mpi_p4est_3d.jl +++ b/test/test_mpi_p4est_3d.jl @@ -18,6 +18,21 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " # Expected errors are exactly the same as with TreeMesh! 
l2 = [0.00016263963870641478], linf = [0.0014537194925779984]) + + @testset "error-based step size control" begin + Trixi.mpi_isroot() && println("-"^100) + Trixi.mpi_isroot() && println("elixir_advection_basic.jl with error-based step size control") + + sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4, + save_everystep=false, callback=callbacks, + internalnorm=ode_norm, + unstable_check=ode_unstable_check); summary_callback() + errors = analysis_callback(sol) + if Trixi.mpi_isroot() + @test errors.l2 ≈ [0.00016800412839949264] rtol=1.0e-4 + @test errors.linf ≈ [0.0014548839020096516] rtol=1.0e-4 + end + end end @trixi_testset "elixir_advection_amr.jl" begin diff --git a/test/test_mpi_tree.jl b/test/test_mpi_tree.jl index d8f98f17c53..79ecd39287b 100644 --- a/test/test_mpi_tree.jl +++ b/test/test_mpi_tree.jl @@ -124,6 +124,21 @@ const EXAMPLES_DIR = joinpath(pathof(Trixi) |> dirname |> dirname, "examples", " @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_ec.jl"), l2 = [0.061751715597716854, 0.05018223615408711, 0.05018989446443463, 0.225871559730513], linf = [0.29347582879608825, 0.31081249232844693, 0.3107380389947736, 1.0540358049885143]) + + @testset "error-based step size control" begin + Trixi.mpi_isroot() && println("-"^100) + Trixi.mpi_isroot() && println("elixir_euler_ec.jl with error-based step size control") + + sol = solve(ode, RDPK3SpFSAL35(), abstol=1.0e-4, reltol=1.0e-4, + save_everystep=false, callback=callbacks, + internalnorm=ode_norm, + unstable_check=ode_unstable_check); summary_callback() + errors = analysis_callback(sol) + if Trixi.mpi_isroot() + @test errors.l2 ≈ [0.061653630426688116, 0.05006930431098764, 0.05007694316484242, 0.22550689872331683] rtol=1.0e-4 + @test errors.linf ≈ [0.28516937484583693, 0.2983633696512788, 0.297812036335975, 1.027368795517512] rtol=1.0e-4 + end + end end @trixi_testset "elixir_euler_vortex.jl" begin