Skip to content

Commit

Permalink
Improve compilation by using macros from Base.Cartesian
Browse files Browse the repository at this point in the history
  • Loading branch information
dennisYatunin committed Oct 14, 2024
1 parent 3d90cc3 commit b28ccbb
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 130 deletions.
13 changes: 6 additions & 7 deletions docs/src/developer_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ CurrentModule = UnrolledUtilities

There are two general ways to implement loop unrolling in Julia—recursively
splatting iterator contents and manually generating unrolled expressions. For
example, the recursively unrolled version of `foreach` is
example, a recursively unrolled version of the `foreach` function is

```julia
unrolled_foreach(f, itr) = _unrolled_foreach(f, itr...)
_unrolled_foreach(f) = nothing
_unrolled_foreach(f, item, items...) = (f(item); _unrolled_foreach(f, items...))
```

In contrast, the generatively unrolled version of `foreach` is
In contrast, a generatively unrolled implementation of this function looks like

```julia
unrolled_foreach(f, itr) = _unrolled_foreach(Val(length(itr)), f, itr)
Expand All @@ -30,15 +30,14 @@ rec_unroll
```

!!! tip "Tip"
Recursive loop unrolling can be disabled globally with the following
function redefinition:
Recursive loop unrolling can be enabled by redefining this function:

```julia
rec_unroll(itr) = false
rec_unroll(itr) = true
```

The cutoff length of 16 for switching to generative unrolling is motivated by
the benchmarks for [Generative vs. Recursive Unrolling](@ref).
The default choice of generative unrolling is motivated by the benchmarks for
[Generative vs. Recursive Unrolling](@ref).

## Interface API

Expand Down
14 changes: 7 additions & 7 deletions src/StaticBitVector.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ end
@inline
first_index = n_bits_per_int * (int_index - 1) + 1
unrolled_reduce(
StaticOneTo(min(n_bits_per_int, N - first_index + 1));
init = zero(U),
StaticOneTo(min(n_bits_per_int, N - first_index + 1)),
zero(U),
) do int, bit_index
@inline
bit_offset = bit_index - 1
Expand Down Expand Up @@ -93,15 +93,15 @@ end
n_bits_per_int = 8 * sizeof(U)
n_ints = cld(N, n_bits_per_int)
ints = unrolled_accumulate(
StaticOneTo(n_ints);
init = (nothing, init),
transform = first,
StaticOneTo(n_ints),
(nothing, init),
first,
) do (_, init_value_for_new_int), int_index
@inline
first_index = n_bits_per_int * (int_index - 1) + 1
unrolled_reduce(
StaticOneTo(min(n_bits_per_int, N - first_index + 1));
init = (zero(U), init_value_for_new_int),
StaticOneTo(min(n_bits_per_int, N - first_index + 1)),
(zero(U), init_value_for_new_int),
) do (int, prev_value), bit_index
@inline
bit_offset = bit_index - 1
Expand Down
25 changes: 9 additions & 16 deletions src/UnrolledUtilities.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,6 @@ include("generatively_unrolled_functions.jl")
# Keyword-argument front end for `unrolled_reduce`: forwards `init` (or the
# `NoInit()` placeholder when no initial value is given) to the positional
# three-argument method.
@inline function unrolled_reduce(op, itr; init = NoInit())
    return unrolled_reduce(op, itr, init)
end

# TODO: Figure out why unrolled_reduce(op, Val(N), init) compiles faster than
# unrolled_reduce(op, StaticOneTo(N), init) for the non-orographic gravity wave
# parametrization test in ClimaAtmos, to the point where the StaticOneTo version
# completely hangs while the Val version compiles in only a few seconds.
@inline unrolled_reduce(op, val_N::Val, init) =
val_N isa Val{0} && init isa NoInit ?
error("unrolled_reduce requires an init value for Val(0)") :
val_unrolled_reduce(op, val_N, init)

# Reduce the lazily mapped iterator `Iterators.map(f, itrs...)` with `op`,
# starting from `init` (or from `NoInit()` when no initial value is given).
@inline function unrolled_mapreduce(f, op, itrs...; init = NoInit())
    mapped_itr = Iterators.map(f, itrs...)
    return unrolled_reduce(op, mapped_itr, init)
end

Expand All @@ -85,14 +76,16 @@ include("generatively_unrolled_functions.jl")
constructor_from_tuple(output_type)(
unrolled_accumulate_into_tuple(op, itr, init, transform),
)
@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) =
@inline unrolled_accumulate(op, itr, init, transform) =
unrolled_accumulate_into(
accumulate_output_type(op, itr, init, transform),
op,
itr,
init,
transform,
)
# Keyword-argument front end for `unrolled_accumulate`: forwards `init` and
# `transform` (defaulting to `NoInit()` and `identity`) to the positional
# four-argument method.
@inline function unrolled_accumulate(op, itr; init = NoInit(), transform = identity)
    return unrolled_accumulate(op, itr, init, transform)
end

# Append `item` to the items of `itr` and build the result with the constructor
# that `constructor_from_tuple` returns for `output_type`.
@inline function unrolled_push_into(output_type, itr, item)
    construct_output = constructor_from_tuple(output_type)
    extended_items = (itr..., item)
    return construct_output(extended_items)
end
Expand Down Expand Up @@ -122,36 +115,36 @@ include("generatively_unrolled_functions.jl")
# Using === instead of == or isequal improves type stability for singletons.

@inline unrolled_unique(itr) =
unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item
unrolled_reduce(itr, inferred_empty(itr)) do unique_items, item
@inline
unrolled_in(item, unique_items) ? unique_items :
unrolled_push(unique_items, item)
end

@inline unrolled_filter(f, itr) =
unrolled_reduce(itr; init = inferred_empty(itr)) do items_with_true_f, item
unrolled_reduce(itr, inferred_empty(itr)) do items_with_true_f, item
@inline
f(item) ? unrolled_push(items_with_true_f, item) : items_with_true_f
end

@inline unrolled_split(f, itr) =
unrolled_reduce(
itr;
init = (inferred_empty(itr), inferred_empty(itr)),
itr,
(inferred_empty(itr), inferred_empty(itr)),
) do (items_with_true_f, items_with_false_f), item
@inline
f(item) ? (unrolled_push(items_with_true_f, item), items_with_false_f) :
(items_with_true_f, unrolled_push(items_with_false_f, item))
end

@inline unrolled_flatten(itr) =
unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr))
unrolled_reduce(unrolled_append, itr, promoted_empty(itr))

# Lazily map `f` over `itrs` and flatten the mapped results with
# `unrolled_flatten`.
@inline function unrolled_flatmap(f, itrs...)
    mapped_itr = Iterators.map(f, itrs...)
    return unrolled_flatten(mapped_itr)
end

@inline unrolled_product(itrs...) =
unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr
unrolled_reduce(itrs, (promoted_empty(itrs),)) do product_itr, itr
@inline
unrolled_flatmap(itr) do item
@inline
Expand Down
116 changes: 46 additions & 70 deletions src/generatively_unrolled_functions.jl
Original file line number Diff line number Diff line change
@@ -1,93 +1,69 @@
@generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = Expr(
:block,
Expr(:meta, :inline),
Expr(:||, (:(f(generic_getindex(itr, $n))) for n in 1:N)...),
)
# Generated implementation of an unrolled `any` for an iterator whose length `N`
# is carried in the type domain. `Base.Cartesian.@nany` expands the anonymous
# function into the chain
# `f(generic_getindex(itr, 1)) || ... || f(generic_getindex(itr, N))`.
@generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = quote
@inline
return Base.Cartesian.@nany $N n -> f(generic_getindex(itr, n))
end
# Entry point: wraps `length(itr)` in a `Val` so that the generated method above
# can unroll over it.
@inline gen_unrolled_any(f, itr) = _gen_unrolled_any(Val(length(itr)), f, itr)

@generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = Expr(
:block,
Expr(:meta, :inline),
Expr(:&&, (:(f(generic_getindex(itr, $n))) for n in 1:N)...),
)
# Generated implementation of an unrolled `all` for an iterator whose length `N`
# is carried in the type domain. `Base.Cartesian.@nall` expands the anonymous
# function into the chain
# `f(generic_getindex(itr, 1)) && ... && f(generic_getindex(itr, N))`.
@generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = quote
@inline
return Base.Cartesian.@nall $N n -> f(generic_getindex(itr, n))
end
# Entry point: wraps `length(itr)` in a `Val` so that the generated method above
# can unroll over it.
@inline gen_unrolled_all(f, itr) = _gen_unrolled_all(Val(length(itr)), f, itr)

@generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = Expr(
:block,
Expr(:meta, :inline),
(:(f(generic_getindex(itr, $n))) for n in 1:N)...,
nothing,
)
# Generated implementation of an unrolled `foreach` for an iterator whose length
# `N` is carried in the type domain. `Base.Cartesian.@nexprs` emits one call
# `f(generic_getindex(itr, n))` per index `n` in `1:N`, in order; the results of
# `f` are discarded and `nothing` is returned.
@generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = quote
@inline
Base.Cartesian.@nexprs $N n -> f(generic_getindex(itr, n))
return nothing
end
# Entry point: wraps `length(itr)` in a `Val` so that the generated method above
# can unroll over it.
@inline gen_unrolled_foreach(f, itr) =
_gen_unrolled_foreach(Val(length(itr)), f, itr)

@generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = Expr(
:block,
Expr(:meta, :inline),
Expr(:tuple, (:(f(generic_getindex(itr, $n))) for n in 1:N)...),
)
# Generated implementation of an unrolled `map` for an iterator whose length `N`
# is carried in the type domain. `Base.Cartesian.@ntuple` builds the `N`-tuple
# `(f(generic_getindex(itr, 1)), ..., f(generic_getindex(itr, N)))`.
@generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = quote
@inline
return Base.Cartesian.@ntuple $N n -> f(generic_getindex(itr, n))
end
# Entry point: wraps `length(itr)` in a `Val` so that the generated method above
# can unroll over it.
@inline gen_unrolled_map(f, itr) = _gen_unrolled_map(Val(length(itr)), f, itr)

@generated _gen_unrolled_applyat(::Val{N}, f, n, itr) where {N} = Expr(
:block,
Expr(:meta, :inline),
(:(n == $n && return f(generic_getindex(itr, $n))) for n in 1:N)...,
:(unrolled_applyat_bounds_error()),
) # This block gets optimized into a switch instruction during LLVM codegen.
# Generated implementation of `unrolled_applyat` for an iterator whose length `N`
# is carried in the type domain. The runtime index is bound to `n′` so that it
# cannot collide with `n`, the index variable that `Base.Cartesian.@nexprs`
# replaces with the literals `1, ..., N`. The expansion is a sequence of checks
#     n′ == 1 && return f(generic_getindex(itr, 1))
#     ...
#     n′ == N && return f(generic_getindex(itr, N))
# so every item is accessed with a literal index. If no check matches,
# `unrolled_applyat_bounds_error()` is called.
@generated _gen_unrolled_applyat(::Val{N}, f, n′, itr) where {N} = quote
    @inline
    Base.Cartesian.@nexprs $N n ->
        (n′ == n && return f(generic_getindex(itr, n)))
    unrolled_applyat_bounds_error()
end # This is optimized into a switch instruction during LLVM code generation.
# Entry point: wraps `length(itr)` in a `Val` and forwards the runtime index `n`
# positionally to the generated method above.
@inline gen_unrolled_applyat(f, n, itr) =
    _gen_unrolled_applyat(Val(length(itr)), f, n, itr)

@generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = Expr(
:block,
Expr(:meta, :inline),
foldl(
(op_expr, n) -> :(op($op_expr, generic_getindex(itr, $n))),
(init <: NoInit ? 2 : 1):N;
init = init <: NoInit ? :(generic_getindex(itr, 1)) : :init,
), # Use foldl instead of reduce to guarantee left associativity.
)
# Generated left fold over an iterator whose length `N` is carried in the type
# domain, for the case where an initial value is supplied. `@nexprs` replaces
# `value_n` and `value_{n - 1}` with numbered locals, producing the chain
#     value_1 = op(value_0, generic_getindex(itr, 1))
#     ...
#     value_N = op(value_{N-1}, generic_getindex(itr, N))
# whose last assignment is the returned result.
@generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = quote
@inline
value_0 = init
# For an empty iterator `@nexprs` generates no expressions, so `init` is
# returned explicitly.
$N == 0 && return value_0
return Base.Cartesian.@nexprs $N n ->
(value_n = op(value_{n - 1}, generic_getindex(itr, n)))
end
# Method for reductions without an initial value (`NoInit`): the fold is seeded
# with the first item and `op` is applied to items `2` through `N`.
@generated _gen_unrolled_reduce(::Val{N}, op, itr, ::NoInit) where {N} = quote
@inline
value_1 = generic_getindex(itr, 1)
# With a single item there is nothing to combine, so `op` is never called.
$N == 1 && return value_1
return Base.Cartesian.@nexprs $(N - 1) n ->
(value_{n + 1} = op(value_n, generic_getindex(itr, n + 1)))
end
# Entry point: wraps `length(itr)` in a `Val` so that the generated methods above
# can unroll over it; dispatch on `init` selects between the two methods.
@inline gen_unrolled_reduce(op, itr, init) =
_gen_unrolled_reduce(Val(length(itr)), op, itr, init)

@generated function _gen_unrolled_accumulate(
@generated _gen_unrolled_accumulate(
::Val{N},
op,
itr,
init,
transform,
) where {N}
first_item_expr = :(generic_getindex(itr, 1))
init_expr = init <: NoInit ? first_item_expr : :(op(init, $first_item_expr))
transformed_exprs_and_op_exprs =
accumulate(1:N; init = (nothing, init_expr)) do (_, op_expr), n
var = gensym()
next_op_expr = :(op($var, generic_getindex(itr, $(n + 1))))
(:($var = $op_expr; transform($var)), next_op_expr)
end
return Expr(
:block,
Expr(:meta, :inline),
Expr(:tuple, Iterators.map(first, transformed_exprs_and_op_exprs)...),
)
) where {N} = quote
@inline
$N == 0 && return ()
first_itr_item = generic_getindex(itr, 1)
value_1 = init isa NoInit ? first_itr_item : op(init, first_itr_item)
Base.Cartesian.@nexprs $(N - 1) n ->
(value_{n + 1} = op(value_n, generic_getindex(itr, n + 1)))
return Base.Cartesian.@ntuple $N n -> transform(value_n)
end
@inline gen_unrolled_accumulate(op, itr, init, transform) =
_gen_unrolled_accumulate(Val(length(itr)), op, itr, init, transform)

# TODO: The following is experimental and will likely be removed in the future.
# For some reason, combining these two methods into one (or combining them with
# the method for gen_unrolled_reduce defined above) causes compilation of the
# non-orographic gravity wave parametrization test in ClimaAtmos to hang.
# Wrapping the first method's result in a block and adding an inline annotation
# also causes compilation to hang. Even using the assignment form of the first
# method definition below (as opposed to the function syntax used here) causes
# it to hang. This has not yet been replicated in a minimal working example.
@generated function val_unrolled_reduce(op, ::Val{N}, init) where {N}
return foldl((:init, 1:N...)) do prev_op_expr, item_expr
:(op($prev_op_expr, $item_expr))
end
end
@generated val_unrolled_reduce(op, ::Val{N}, ::NoInit) where {N} = Expr(
:block,
Expr(:meta, :inline),
foldl((op_expr, item_expr) -> :(op($op_expr, $item_expr)), 1:N),
)
1 change: 0 additions & 1 deletion src/recursion_limits.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
gen_unrolled_applyat,
gen_unrolled_reduce,
gen_unrolled_accumulate,
val_unrolled_reduce,
unrolled_any,
unrolled_all,
unrolled_foreach,
Expand Down
13 changes: 3 additions & 10 deletions src/unrollable_iterator_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,10 @@
rec_unroll(itr)
Whether to use recursive loop unrolling instead of generative loop unrolling for
the iterator `itr`.
In general, recursive loop unrolling is faster to compile for small iterators,
but it becomes extremely slow to compile for long iterators, and it usually
generates suboptimal LLVM code for long iterators. On the other hand, generative
loop unrolling is slow to compile for small iterators, but its compilation time
does not grow as rapidly with respect to iterator size, and it always generates
optimal LLVM code. The default is currently to use recursive unrolling for
iterator lengths up to 16, and to use generative unrolling for longer iterators.
the iterator `itr`. Recursive unrolling can lead to suboptimal LLVM code for
iterators of more than 32 items, so this is set to `false` by default.
"""
@inline rec_unroll(itr) = length(itr) <= 16
@inline rec_unroll(itr) = false

"""
generic_getindex(itr, n)
Expand Down
21 changes: 2 additions & 19 deletions test/test_and_analyze.jl
Original file line number Diff line number Diff line change
Expand Up @@ -877,32 +877,15 @@ title = "Very Long Iterators"
comparison_table_dict = (comparison_table_dicts[title] = OrderedDict())

@testset "unrolled functions of Tuples vs. StaticOneTos" begin
for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(8185))
for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(9000))
@test_unrolled (itr,) unrolled_reduce(+, itr) reduce(+, itr) "Ints"
@test_unrolled(
(itr,),
unrolled_mapreduce(log, +, itr),
mapreduce(log, +, itr),
"Ints",
)
end # These can each take 40 seconds to compile for ntuple(identity, 8185).
for itr in (ntuple(identity, 8186), StaticOneTo(8186))
@test_throws "gc handles" unrolled_reduce(+, itr)
@test_throws "gc handles" unrolled_mapreduce(log, +, itr)
end
# TODO: Why does the compiler throw an error when generating functions that
# get unrolled into more than 8185 lines of LLVM code?

for itr in (StaticOneTo(8185), StaticOneTo(8186))
@test_unrolled(
(itr,),
unrolled_reduce(+, Val(length(itr))),
reduce(+, itr),
"Ints",
)
end
@test_throws "gc handles" unrolled_reduce(+, Val(8188))
# TODO: Why is the limit 8186 for the Val version of unrolled_reduce?
end # These can take over a minute to compile for ntuple(identity, 9000).
end

title = "Generative vs. Recursive Unrolling"
Expand Down

0 comments on commit b28ccbb

Please sign in to comment.