Commit

Merge branch 'main' into SutherlandsLaw
jlchan authored Mar 27, 2024
2 parents f9091d9 + 909abb4 commit 042b18e
Showing 5 changed files with 40 additions and 59 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "Trixi"
uuid = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
authors = ["Michael Schlottke-Lakemper <[email protected]>", "Gregor Gassner <[email protected]>", "Hendrik Ranocha <[email protected]>", "Andrew R. Winters <[email protected]>", "Jesse Chan <[email protected]>"]
version = "0.7.5-pre"
version = "0.7.6-pre"

[deps]
CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
@@ -75,7 +75,7 @@ MuladdMacro = "0.2.2"
Octavian = "0.3.21"
OffsetArrays = "1.12"
P4est = "0.4.9"
Polyester = "0.7.5"
Polyester = "0.7.10"
PrecompileTools = "1.1"
Preferences = "1.3"
Printf = "1"
11 changes: 0 additions & 11 deletions docs/src/performance.md
@@ -282,14 +282,3 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension,
timing result, you need to set the analysis interval such that the
`AnalysisCallback` is invoked at least once during the course of the simulation and
discard the first PID value.

-## Performance issues with multi-threaded reductions
-[False sharing](https://en.wikipedia.org/wiki/False_sharing) is a known performance issue
-for systems with distributed caches. It also occurred for the implementation of a thread
-parallel bounds checking routine for the subcell IDP limiting
-in [PR #1736](https://github.com/trixi-framework/Trixi.jl/pull/1736).
-After some [testing and discussion](https://github.com/trixi-framework/Trixi.jl/pull/1736#discussion_r1423881895),
-it turned out that initializing a vector of length `n * Threads.nthreads()` and only using every
-n-th entry instead of a vector of length `Threads.nthreads()` fixes the problem.
-Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`.
-Now, the bounds checking routine of the IDP limiting scales as hoped.
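
For illustration, the padding pattern described in the removed section can be sketched as follows. This is a hypothetical minimal example (the function name and `Float64` element type are assumptions, not Trixi.jl code):

```julia
using Base.Threads

# Each thread accumulates only into slot `stride * threadid()`, so the
# per-thread accumulators land on distinct cache lines and no false
# sharing occurs. `stride` is n = 128B / sizeof(eltype).
function padded_threaded_max(data::Vector{Float64})
    stride = div(128, sizeof(Float64))
    acc = fill(-Inf, stride * nthreads())
    @threads for i in eachindex(data)
        slot = stride * threadid()
        acc[slot] = max(acc[slot], data[i])
    end
    # Combine the per-thread partial results.
    return maximum(acc[stride * i] for i in 1:nthreads())
end
```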
66 changes: 29 additions & 37 deletions src/callbacks_stage/subcell_bounds_check_2d.jl
@@ -12,35 +12,37 @@
(; variable_bounds) = limiter.cache.subcell_limiter_coefficients
(; idp_bounds_delta_local, idp_bounds_delta_global) = limiter.cache

-# Note: Accessing the threaded memory vector `idp_bounds_delta_local` with
-# `deviation = idp_bounds_delta_local[key][Threads.threadid()]` causes critical performance
-# issues due to False Sharing.
-# Initializing a vector with n times the length and using every n-th entry fixes this
-# problem and allows proper scaling:
-# `deviation = idp_bounds_delta_local[key][n * Threads.threadid()]`
-# Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
-stride_size = div(128, sizeof(eltype(u))) # = n
+# Note: In order to get the maximum deviation from the target bounds, this bounds check
+# requires a reduction in every RK stage and for every enabled limiting option. To make
+# this Thread-parallel we are using Polyester.jl's (at least v0.7.10) `@batch reduction`
+# functionality.
+# Although `@threaded` and `@batch` are currently used equivalently in Trixi.jl, we use
+# `@batch` here to allow a possible redefinition of `@threaded` without creating errors here.
+# See also https://github.com/trixi-framework/Trixi.jl/pull/1888#discussion_r1537785293.

if local_minmax
for v in limiter.local_minmax_variables_cons
v_string = string(v)
key_min = Symbol(v_string, "_min")
key_max = Symbol(v_string, "_max")
-deviation_min_threaded = idp_bounds_delta_local[key_min]
-deviation_max_threaded = idp_bounds_delta_local[key_max]
-@threaded for element in eachelement(solver, cache)
-deviation_min = deviation_min_threaded[stride_size * Threads.threadid()]
-deviation_max = deviation_max_threaded[stride_size * Threads.threadid()]
+deviation_min = idp_bounds_delta_local[key_min]
+deviation_max = idp_bounds_delta_local[key_max]
+@batch reduction=((max, deviation_min), (max, deviation_max)) for element in eachelement(solver,
+cache)
for j in eachnode(solver), i in eachnode(solver)
var = u[v, i, j, element]
# Note: We always save the absolute deviations >= 0 and therefore use the
# `max` operator for the lower and upper bound. The different directions of
# upper and lower bound are considered in their calculations with a
# different sign.
deviation_min = max(deviation_min,
variable_bounds[key_min][i, j, element] - var)
deviation_max = max(deviation_max,
var - variable_bounds[key_max][i, j, element])
end
-deviation_min_threaded[stride_size * Threads.threadid()] = deviation_min
-deviation_max_threaded[stride_size * Threads.threadid()] = deviation_max
end
+idp_bounds_delta_local[key_min] = deviation_min
+idp_bounds_delta_local[key_max] = deviation_max
end
end
if positivity
@@ -49,40 +49,35 @@
continue
end
key = Symbol(string(v), "_min")
-deviation_threaded = idp_bounds_delta_local[key]
-@threaded for element in eachelement(solver, cache)
-deviation = deviation_threaded[stride_size * Threads.threadid()]
+deviation = idp_bounds_delta_local[key]
+@batch reduction=(max, deviation) for element in eachelement(solver, cache)
for j in eachnode(solver), i in eachnode(solver)
var = u[v, i, j, element]
deviation = max(deviation,
variable_bounds[key][i, j, element] - var)
end
-deviation_threaded[stride_size * Threads.threadid()] = deviation
end
+idp_bounds_delta_local[key] = deviation
end
for variable in limiter.positivity_variables_nonlinear
key = Symbol(string(variable), "_min")
-deviation_threaded = idp_bounds_delta_local[key]
-@threaded for element in eachelement(solver, cache)
-deviation = deviation_threaded[stride_size * Threads.threadid()]
+deviation = idp_bounds_delta_local[key]
+@batch reduction=(max, deviation) for element in eachelement(solver, cache)
for j in eachnode(solver), i in eachnode(solver)
var = variable(get_node_vars(u, equations, solver, i, j, element),
equations)
deviation = max(deviation,
variable_bounds[key][i, j, element] - var)
end
-deviation_threaded[stride_size * Threads.threadid()] = deviation
end
+idp_bounds_delta_local[key] = deviation
end
end

for (key, _) in idp_bounds_delta_local
-# Calculate maximum deviations of all threads
-idp_bounds_delta_local[key][stride_size] = maximum(idp_bounds_delta_local[key][stride_size * i]
-for i in 1:Threads.nthreads())
# Update global maximum deviations
idp_bounds_delta_global[key] = max(idp_bounds_delta_global[key],
-idp_bounds_delta_local[key][stride_size])
+idp_bounds_delta_local[key])
end

if save_errors
Expand All @@ -92,32 +89,27 @@
if local_minmax
for v in limiter.local_minmax_variables_cons
v_string = string(v)
print(f, ", ",
idp_bounds_delta_local[Symbol(v_string, "_min")][stride_size],
", ",
idp_bounds_delta_local[Symbol(v_string, "_max")][stride_size])
print(f, ", ", idp_bounds_delta_local[Symbol(v_string, "_min")],
", ", idp_bounds_delta_local[Symbol(v_string, "_max")])
end
end
if positivity
for v in limiter.positivity_variables_cons
if v in limiter.local_minmax_variables_cons
continue
end
print(f, ", ",
idp_bounds_delta_local[Symbol(string(v), "_min")][stride_size])
print(f, ", ", idp_bounds_delta_local[Symbol(string(v), "_min")])
end
for variable in limiter.positivity_variables_nonlinear
print(f, ", ",
-idp_bounds_delta_local[Symbol(string(variable), "_min")][stride_size])
+idp_bounds_delta_local[Symbol(string(variable), "_min")])
end
end
println(f)
end
# Reset local maximum deviations
for (key, _) in idp_bounds_delta_local
-for i in 1:Threads.nthreads()
-idp_bounds_delta_local[key][stride_size * i] = zero(eltype(idp_bounds_delta_local[key][stride_size]))
-end
+idp_bounds_delta_local[key] = zero(eltype(idp_bounds_delta_local[key]))
end
end

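For context, the `@batch reduction` mechanism used above can be illustrated in isolation. A minimal sketch, assuming Polyester.jl v0.7.10 or newer (the function name and data are hypothetical, not part of this commit):

```julia
using Polyester

# `reduction=(max, dev)` gives every thread a private copy of `dev` and
# combines the copies with `max` after the loop, so no shared accumulator
# vector (and thus no false sharing) is involved. As in the code above,
# the lower-bound deviation is stored as `bound - value >= 0`.
function max_deviation(values::Vector{Float64}, bound::Float64)
    dev = zero(eltype(values))
    @batch reduction=(max, dev) for i in eachindex(values)
        dev = max(dev, bound - values[i])
    end
    return dev
end
```

Compared with the stride-padded vector, the accumulator is thread-private by construction, which removes the manual cache-line bookkeeping entirely.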
7 changes: 7 additions & 0 deletions src/callbacks_step/amr.jl
@@ -138,11 +138,18 @@ function initialize!(cb::DiscreteCallback{Condition, Affect!}, u, t,
# iterate until mesh does not change anymore
has_changed = amr_callback(integrator,
only_refine = amr_callback.adapt_initial_condition_only_refine)
+iterations = 1
while has_changed
compute_coefficients!(integrator.u, t, semi)
u_modified!(integrator, true)
has_changed = amr_callback(integrator,
only_refine = amr_callback.adapt_initial_condition_only_refine)
+iterations = iterations + 1
+if iterations > 10
+@warn "AMR for initial condition did not settle within 10 iterations!\n" *
+"Consider adjusting thresholds or setting `adapt_initial_condition_only_refine`."
+break
+end
end
end

11 changes: 2 additions & 9 deletions src/solvers/dgsem_tree/subcell_limiters_2d.jl
@@ -18,18 +18,11 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat

# Memory for bounds checking routine with `BoundsCheckCallback`.
# Local variable contains the maximum deviation since the last export.
-# Using a threaded vector to parallelize bounds check.
-idp_bounds_delta_local = Dict{Symbol, Vector{real(basis)}}()
+idp_bounds_delta_local = Dict{Symbol, real(basis)}()
# Global variable contains the total maximum deviation.
idp_bounds_delta_global = Dict{Symbol, real(basis)}()
-# Note: False sharing causes critical performance issues on multiple threads when using a vector
-# of length `Threads.nthreads()`. Initializing a vector of length `n * Threads.nthreads()`
-# and then only using every n-th entry, fixes the problem and allows proper scaling.
-# Since there are no processors with caches over 128B, we use `n = 128B / size(uEltype)`
-stride_size = div(128, sizeof(eltype(basis.nodes))) # = n
for key in bound_keys
-idp_bounds_delta_local[key] = [zero(real(basis))
-for _ in 1:(stride_size * Threads.nthreads())]
+idp_bounds_delta_local[key] = zero(real(basis))
idp_bounds_delta_global[key] = zero(real(basis))
end

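With the dictionaries now mapping each bound key to a plain scalar, the bookkeeping between bounds checks reduces to scalar `max` and `zero` operations. A hypothetical sketch of that lifecycle (keys and values invented for illustration):

```julia
# Local deltas hold the maximum deviation since the last export,
# global deltas the maximum over the whole simulation.
delta_local = Dict{Symbol, Float64}(:rho_min => 0.0)
delta_global = Dict{Symbol, Float64}(:rho_min => 0.0)

# A bounds check produced some deviation for :rho_min:
delta_local[:rho_min] = max(delta_local[:rho_min], 1.3e-15)

# Fold local into global, then reset local for the next interval.
delta_global[:rho_min] = max(delta_global[:rho_min], delta_local[:rho_min])
delta_local[:rho_min] = zero(Float64)
```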
