From f8426716fedfd2fe1db97c4b2bda14b4b1f57a52 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 11 Oct 2024 18:31:26 -0700 Subject: [PATCH] Improve compilation by using macros from Base.Cartesian --- .github/workflows/invalidations.yml | 2 +- docs/src/developer_guide.md | 13 ++- src/StaticBitVector.jl | 14 +-- src/UnrolledUtilities.jl | 16 ++-- src/generatively_unrolled_functions.jl | 127 ++++++++++++------------- src/unrollable_iterator_interface.jl | 13 +-- test/test_and_analyze.jl | 21 +--- 7 files changed, 89 insertions(+), 117 deletions(-) diff --git a/.github/workflows/invalidations.yml b/.github/workflows/invalidations.yml index 881ebec..7439f9f 100644 --- a/.github/workflows/invalidations.yml +++ b/.github/workflows/invalidations.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: julia-actions/setup-julia@v1 with: - version: '1' + version: '1.10' # JET and SnoopCompile do not yet support Julia 1.11 - uses: actions/checkout@v4 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-invalidations@v1 diff --git a/docs/src/developer_guide.md b/docs/src/developer_guide.md index 9ec99cd..1653c87 100644 --- a/docs/src/developer_guide.md +++ b/docs/src/developer_guide.md @@ -6,7 +6,7 @@ CurrentModule = UnrolledUtilities There are two general ways to implement loop unrolling in Julia—recursively splatting iterator contents and manually generating unrolled expressions. For -example, the recursively unrolled version of `foreach` is +example, a recursively unrolled version of the `foreach` function is ```julia unrolled_foreach(f, itr) = _unrolled_foreach(f, itr...) @@ -14,7 +14,7 @@ _unrolled_foreach(f) = nothing _unrolled_foreach(f, item, items...) = (f(item); _unrolled_foreach(f, items...)) ``` -In contrast, the generatively unrolled version of `foreach` is +In contrast, a generatively unrolled implementation of this function looks like ```julia unrolled_foreach(f, itr) = _unrolled_foreach(Val(length(itr)), f, itr) @@ -30,15 +30,14 @@ rec_unroll ``` !!! tip "Tip" - Recursive loop unrolling can be disabled globally with the following - function redefinition: + Recursive loop unrolling can be enabled by redefining this function: ```julia - rec_unroll(itr) = false + rec_unroll(itr) = true ``` -The cutoff length of 16 for switching to generative unrolling is motivated by -the benchmarks for [Generative vs. Recursive Unrolling](@ref). +The default choice of generative unrolling is motivated by the benchmarks for +[Generative vs. Recursive Unrolling](@ref). ## Interface API diff --git a/src/StaticBitVector.jl b/src/StaticBitVector.jl index d41bffa..3584624 100644 --- a/src/StaticBitVector.jl +++ b/src/StaticBitVector.jl @@ -34,8 +34,8 @@ end @inline first_index = n_bits_per_int * (int_index - 1) + 1 unrolled_reduce( - StaticOneTo(min(n_bits_per_int, N - first_index + 1)); - init = zero(U), + StaticOneTo(min(n_bits_per_int, N - first_index + 1)), + zero(U), ) do int, bit_index @inline bit_offset = bit_index - 1 @@ -93,15 +93,15 @@ end n_bits_per_int = 8 * sizeof(U) n_ints = cld(N, n_bits_per_int) ints = unrolled_accumulate( - StaticOneTo(n_ints); - init = (nothing, init), - transform = first, + StaticOneTo(n_ints), + (nothing, init), + first, ) do (_, init_value_for_new_int), int_index @inline first_index = n_bits_per_int * (int_index - 1) + 1 unrolled_reduce( - StaticOneTo(min(n_bits_per_int, N - first_index + 1)); - init = (zero(U), init_value_for_new_int), + StaticOneTo(min(n_bits_per_int, N - first_index + 1)), + (zero(U), init_value_for_new_int), ) do (int, prev_value), bit_index @inline bit_offset = bit_index - 1 diff --git a/src/UnrolledUtilities.jl b/src/UnrolledUtilities.jl index 56ba671..770064c 100644 --- a/src/UnrolledUtilities.jl +++ b/src/UnrolledUtilities.jl @@ -85,7 +85,7 @@ include("generatively_unrolled_functions.jl") constructor_from_tuple(output_type)( unrolled_accumulate_into_tuple(op, itr, init, transform), ) -@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) = +@inline unrolled_accumulate(op, itr, init, transform) = unrolled_accumulate_into( accumulate_output_type(op, itr, init, transform), op, @@ -93,6 +93,8 @@ include("generatively_unrolled_functions.jl") init, transform, ) +@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) = + unrolled_accumulate(op, itr, init, transform) @inline unrolled_push_into(output_type, itr, item) = constructor_from_tuple(output_type)((itr..., item)) @@ -122,22 +124,22 @@ include("generatively_unrolled_functions.jl") # Using === instead of == or isequal improves type stability for singletons. @inline unrolled_unique(itr) = - unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item + unrolled_reduce(itr, inferred_empty(itr)) do unique_items, item @inline unrolled_in(item, unique_items) ? unique_items : unrolled_push(unique_items, item) end @inline unrolled_filter(f, itr) = - unrolled_reduce(itr; init = inferred_empty(itr)) do items_with_true_f, item + unrolled_reduce(itr, inferred_empty(itr)) do items_with_true_f, item @inline f(item) ? unrolled_push(items_with_true_f, item) : items_with_true_f end @inline unrolled_split(f, itr) = unrolled_reduce( - itr; - init = (inferred_empty(itr), inferred_empty(itr)), + itr, + (inferred_empty(itr), inferred_empty(itr)), ) do (items_with_true_f, items_with_false_f), item @inline f(item) ? (unrolled_push(items_with_true_f, item), items_with_false_f) : @@ -145,13 +147,13 @@ include("generatively_unrolled_functions.jl") end @inline unrolled_flatten(itr) = - unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr)) + unrolled_reduce(unrolled_append, itr, promoted_empty(itr)) @inline unrolled_flatmap(f, itrs...) = unrolled_flatten(Iterators.map(f, itrs...)) @inline unrolled_product(itrs...) = - unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr + unrolled_reduce(itrs, (promoted_empty(itrs),)) do product_itr, itr @inline unrolled_flatmap(itr) do item @inline diff --git a/src/generatively_unrolled_functions.jl b/src/generatively_unrolled_functions.jl index 45cb9fd..af55f93 100644 --- a/src/generatively_unrolled_functions.jl +++ b/src/generatively_unrolled_functions.jl @@ -1,93 +1,88 @@ -@generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - Expr(:||, (:(f(generic_getindex(itr, $n))) for n in 1:N)...), -) +@generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = quote + @inline + return Base.Cartesian.@nany $N n -> f(generic_getindex(itr, n)) +end @inline gen_unrolled_any(f, itr) = _gen_unrolled_any(Val(length(itr)), f, itr) -@generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - Expr(:&&, (:(f(generic_getindex(itr, $n))) for n in 1:N)...), -) +@generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = quote + @inline + return Base.Cartesian.@nall $N n -> f(generic_getindex(itr, n)) +end @inline gen_unrolled_all(f, itr) = _gen_unrolled_all(Val(length(itr)), f, itr) -@generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - (:(f(generic_getindex(itr, $n))) for n in 1:N)..., - nothing, -) +@generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = quote + @inline + Base.Cartesian.@nexprs $N n -> f(generic_getindex(itr, n)) + return nothing +end @inline gen_unrolled_foreach(f, itr) = _gen_unrolled_foreach(Val(length(itr)), f, itr) -@generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - Expr(:tuple, (:(f(generic_getindex(itr, $n))) for n in 1:N)...), -) +@generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = quote + @inline + return Base.Cartesian.@ntuple $N n -> f(generic_getindex(itr, n)) +end @inline gen_unrolled_map(f, itr) = _gen_unrolled_map(Val(length(itr)), f, itr) -@generated _gen_unrolled_applyat(::Val{N}, f, n, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - (:(n == $n && return f(generic_getindex(itr, $n))) for n in 1:N)..., - :(unrolled_applyat_bounds_error()), -) # This block gets optimized into a switch instruction during LLVM codegen. +@generated _gen_unrolled_applyat(::Val{N}, f, n′, itr) where {N} = quote + @inline + Base.Cartesian.@nexprs $N n -> + (n′ == n && return f(generic_getindex(itr, n))) + unrolled_applyat_bounds_error() +end # This is optimized into a switch instruction during LLVM code generation. @inline gen_unrolled_applyat(f, n, itr) = _gen_unrolled_applyat(Val(length(itr)), f, n, itr) -@generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = Expr( - :block, - Expr(:meta, :inline), - foldl( - (op_expr, n) -> :(op($op_expr, generic_getindex(itr, $n))), - (init <: NoInit ? 2 : 1):N; - init = init <: NoInit ? :(generic_getindex(itr, 1)) : :init, - ), # Use foldl instead of reduce to guarantee left associativity. -) +@generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = quote + @inline + value_0 = init + $N == 0 && return value_0 + return Base.Cartesian.@nexprs $N n -> + (value_n = op(value_{n - 1}, generic_getindex(itr, n))) +end +@generated _gen_unrolled_reduce(::Val{N}, op, itr, ::NoInit) where {N} = quote + @inline + value_1 = generic_getindex(itr, 1) + $N == 1 && return value_1 + return Base.Cartesian.@nexprs $(N - 1) n -> + (value_{n + 1} = op(value_n, generic_getindex(itr, n + 1))) +end @inline gen_unrolled_reduce(op, itr, init) = _gen_unrolled_reduce(Val(length(itr)), op, itr, init) -@generated function _gen_unrolled_accumulate( +@generated _gen_unrolled_accumulate( ::Val{N}, op, itr, init, transform, -) where {N} - first_item_expr = :(generic_getindex(itr, 1)) - init_expr = init <: NoInit ? first_item_expr : :(op(init, $first_item_expr)) - transformed_exprs_and_op_exprs = - accumulate(1:N; init = (nothing, init_expr)) do (_, op_expr), n - var = gensym() - next_op_expr = :(op($var, generic_getindex(itr, $(n + 1)))) - (:($var = $op_expr; transform($var)), next_op_expr) - end - return Expr( - :block, - Expr(:meta, :inline), - Expr(:tuple, Iterators.map(first, transformed_exprs_and_op_exprs)...), - ) +) where {N} = quote + @inline + $N == 0 && return () + first_itr_item = generic_getindex(itr, 1) + value_1 = init isa NoInit ? first_itr_item : op(init, first_itr_item) + Base.Cartesian.@nexprs $(N - 1) n -> + (value_{n + 1} = op(value_n, generic_getindex(itr, n + 1))) + return Base.Cartesian.@ntuple $N n -> transform(value_n) end @inline gen_unrolled_accumulate(op, itr, init, transform) = _gen_unrolled_accumulate(Val(length(itr)), op, itr, init, transform) # TODO: The following is experimental and will likely be removed in the future. -# For some reason, combining these two methods into one (or combining them with -# the method for gen_unrolled_reduce defined above) causes compilation of the -# non-orographic gravity wave parametrization test in ClimaAtmos to hang. -# Wrapping the first method's result in a block and adding an inline annotation -# also causes compilation to hang. Even using the assignment form of the first -# method definition below (as opposed to the function syntax used here) causes -# it to hang. This has not yet been replicated in a minimal working example. +# For some reason, passing a StaticOneTo of length 256 to the methods for +# gen_unrolled_reduce defined above causes compilation of the non-orographic +# gravity wave parametrization test in ClimaAtmos to hang. Passing a Val(256) to +# these methods for val_unrolled_reduce prevents compilation from hanging. +# However, writing these methods in the same style as gen_unrolled_reduce (with +# the result wrapped in a block with an @inline annotation) still causes +# compilation to hang. Even using the assignment form of the first method +# definition below (as opposed to the function syntax used here) causes it to +# hang. This has not yet been replicated in a minimal working example. @generated function val_unrolled_reduce(op, ::Val{N}, init) where {N} - return foldl((:init, 1:N...)) do prev_op_expr, item_expr - :(op($prev_op_expr, $item_expr)) - end + return :(Base.Cartesian.@nexprs $(N + 1) n -> + (value_n = n == 1 ? init : op(value_{n - 1}, n - 1))) +end +@generated function val_unrolled_reduce(op, ::Val{N}, ::NoInit) where {N} + return :(Base.Cartesian.@nexprs $N n -> + (value_n = n == 1 ? 1 : op(value_{n - 1}, n))) end -@generated val_unrolled_reduce(op, ::Val{N}, ::NoInit) where {N} = Expr( - :block, - Expr(:meta, :inline), - foldl((op_expr, item_expr) -> :(op($op_expr, $item_expr)), 1:N), -) diff --git a/src/unrollable_iterator_interface.jl b/src/unrollable_iterator_interface.jl index f17705f..dde3470 100644 --- a/src/unrollable_iterator_interface.jl +++ b/src/unrollable_iterator_interface.jl @@ -2,17 +2,10 @@ rec_unroll(itr) Whether to use recursive loop unrolling instead of generative loop unrolling for -the iterator `itr`. - -In general, recursive loop unrolling is faster to compile for small iterators, -but it becomes extremely slow to compile for long iterators, and it usually -generates suboptimal LLVM code for long iterators. On the other hand, generative -loop unrolling is slow to compile for small iterators, but its compilation time -does not grow as rapidly with respect to iterator size, and it always generates -optimal LLVM code. The default is currently to use recursive unrolling for -iterator lengths up to 16, and to use generative unrolling for longer iterators. +the iterator `itr`. Recursive unrolling can lead to suboptimal LLVM code for +iterators of more than 32 items, so this is set to `false` by default. """ -@inline rec_unroll(itr) = length(itr) <= 16 +@inline rec_unroll(itr) = false """ generic_getindex(itr, n) diff --git a/test/test_and_analyze.jl b/test/test_and_analyze.jl index 400746e..e269ec0 100644 --- a/test/test_and_analyze.jl +++ b/test/test_and_analyze.jl @@ -877,7 +877,7 @@ title = "Very Long Iterators" comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) @testset "unrolled functions of Tuples vs. StaticOneTos" begin - for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(8185)) + for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(9000)) @test_unrolled (itr,) unrolled_reduce(+, itr) reduce(+, itr) "Ints" @test_unrolled( (itr,), @@ -885,24 +885,7 @@ comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) mapreduce(log, +, itr), "Ints", ) - end # These can each take 40 seconds to compile for ntuple(identity, 8185). - for itr in (ntuple(identity, 8186), StaticOneTo(8186)) - @test_throws "gc handles" unrolled_reduce(+, itr) - @test_throws "gc handles" unrolled_mapreduce(log, +, itr) - end - # TODO: Why does the compiler throw an error when generating functions that - # get unrolled into more than 8185 lines of LLVM code? - - for itr in (StaticOneTo(8185), StaticOneTo(8186)) - @test_unrolled( - (itr,), - unrolled_reduce(+, Val(length(itr))), - reduce(+, itr), - "Ints", - ) - end - @test_throws "gc handles" unrolled_reduce(+, Val(8188)) - # TODO: Why is the limit 8186 for the Val version of unrolled_reduce? + end # These can take over a minute to compile for ntuple(identity, 9000). end title = "Generative vs. Recursive Unrolling"