From b28ccbbc945c6824e69bcaa6142c18e76507a6a3 Mon Sep 17 00:00:00 2001 From: Dennis Yatunin Date: Fri, 11 Oct 2024 18:31:26 -0700 Subject: [PATCH] Improve compilation by using macros from Base.Cartesian --- docs/src/developer_guide.md | 13 ++- src/StaticBitVector.jl | 14 +-- src/UnrolledUtilities.jl | 25 ++---- src/generatively_unrolled_functions.jl | 116 ++++++++++--------------- src/recursion_limits.jl | 1 - src/unrollable_iterator_interface.jl | 13 +-- test/test_and_analyze.jl | 21 +---- 7 files changed, 73 insertions(+), 130 deletions(-) diff --git a/docs/src/developer_guide.md b/docs/src/developer_guide.md index 9ec99cd..1653c87 100644 --- a/docs/src/developer_guide.md +++ b/docs/src/developer_guide.md @@ -6,7 +6,7 @@ CurrentModule = UnrolledUtilities There are two general ways to implement loop unrolling in Julia—recursively splatting iterator contents and manually generating unrolled expressions. For -example, the recursively unrolled version of `foreach` is +example, a recursively unrolled version of the `foreach` function is ```julia unrolled_foreach(f, itr) = _unrolled_foreach(f, itr...) @@ -14,7 +14,7 @@ _unrolled_foreach(f) = nothing _unrolled_foreach(f, item, items...) = (f(item); _unrolled_foreach(f, items...)) ``` -In contrast, the generatively unrolled version of `foreach` is +In contrast, a generatively unrolled implementation of this function looks like ```julia unrolled_foreach(f, itr) = _unrolled_foreach(Val(length(itr)), f, itr) @@ -30,15 +30,14 @@ rec_unroll ``` !!! tip "Tip" - Recursive loop unrolling can be disabled globally with the following - function redefinition: + Recursive loop unrolling can be enabled by redefining this function: ```julia - rec_unroll(itr) = false + rec_unroll(itr) = true ``` -The cutoff length of 16 for switching to generative unrolling is motivated by -the benchmarks for [Generative vs. Recursive Unrolling](@ref). +The default choice of generative unrolling is motivated by the benchmarks for +[Generative vs. Recursive Unrolling](@ref). 
## Interface API diff --git a/src/StaticBitVector.jl b/src/StaticBitVector.jl index d41bffa..3584624 100644 --- a/src/StaticBitVector.jl +++ b/src/StaticBitVector.jl @@ -34,8 +34,8 @@ end @inline first_index = n_bits_per_int * (int_index - 1) + 1 unrolled_reduce( - StaticOneTo(min(n_bits_per_int, N - first_index + 1)); - init = zero(U), + StaticOneTo(min(n_bits_per_int, N - first_index + 1)), + zero(U), ) do int, bit_index @inline bit_offset = bit_index - 1 @@ -93,15 +93,15 @@ end n_bits_per_int = 8 * sizeof(U) n_ints = cld(N, n_bits_per_int) ints = unrolled_accumulate( - StaticOneTo(n_ints); - init = (nothing, init), - transform = first, + StaticOneTo(n_ints), + (nothing, init), + first, ) do (_, init_value_for_new_int), int_index @inline first_index = n_bits_per_int * (int_index - 1) + 1 unrolled_reduce( - StaticOneTo(min(n_bits_per_int, N - first_index + 1)); - init = (zero(U), init_value_for_new_int), + StaticOneTo(min(n_bits_per_int, N - first_index + 1)), + (zero(U), init_value_for_new_int), ) do (int, prev_value), bit_index @inline bit_offset = bit_index - 1 diff --git a/src/UnrolledUtilities.jl b/src/UnrolledUtilities.jl index 56ba671..2929de7 100644 --- a/src/UnrolledUtilities.jl +++ b/src/UnrolledUtilities.jl @@ -62,15 +62,6 @@ include("generatively_unrolled_functions.jl") @inline unrolled_reduce(op, itr; init = NoInit()) = unrolled_reduce(op, itr, init) -# TODO: Figure out why unrolled_reduce(op, Val(N), init) compiles faster than -# unrolled_reduce(op, StaticOneTo(N), init) for the non-orographic gravity wave -# parametrization test in ClimaAtmos, to the point where the StaticOneTo version -# completely hangs while the Val version compiles in only a few seconds. -@inline unrolled_reduce(op, val_N::Val, init) = - val_N isa Val{0} && init isa NoInit ? - error("unrolled_reduce requires an init value for Val(0)") : - val_unrolled_reduce(op, val_N, init) - @inline unrolled_mapreduce(f, op, itrs...; init = NoInit()) = unrolled_reduce(op, Iterators.map(f, itrs...), init) @@ -85,7 +76,7 @@ include("generatively_unrolled_functions.jl") constructor_from_tuple(output_type)( unrolled_accumulate_into_tuple(op, itr, init, transform), ) -@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) = +@inline unrolled_accumulate(op, itr, init, transform) = unrolled_accumulate_into( accumulate_output_type(op, itr, init, transform), op, @@ -93,6 +84,8 @@ include("generatively_unrolled_functions.jl") init, transform, ) +@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) = + unrolled_accumulate(op, itr, init, transform) @inline unrolled_push_into(output_type, itr, item) = constructor_from_tuple(output_type)((itr..., item)) @@ -122,22 +115,22 @@ include("generatively_unrolled_functions.jl") # Using === instead of == or isequal improves type stability for singletons. @inline unrolled_unique(itr) = - unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item + unrolled_reduce(itr, inferred_empty(itr)) do unique_items, item @inline unrolled_in(item, unique_items) ? unique_items : unrolled_push(unique_items, item) end @inline unrolled_filter(f, itr) = - unrolled_reduce(itr; init = inferred_empty(itr)) do items_with_true_f, item + unrolled_reduce(itr, inferred_empty(itr)) do items_with_true_f, item @inline f(item) ? 
unrolled_push(items_with_true_f, item) : items_with_true_f end @inline unrolled_split(f, itr) = unrolled_reduce( - itr; - init = (inferred_empty(itr), inferred_empty(itr)), + itr, + (inferred_empty(itr), inferred_empty(itr)), ) do (items_with_true_f, items_with_false_f), item @inline f(item) ? (unrolled_push(items_with_true_f, item), items_with_false_f) : @@ -145,13 +138,13 @@ include("generatively_unrolled_functions.jl") end @inline unrolled_flatten(itr) = - unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr)) + unrolled_reduce(unrolled_append, itr, promoted_empty(itr)) @inline unrolled_flatmap(f, itrs...) = unrolled_flatten(Iterators.map(f, itrs...)) @inline unrolled_product(itrs...) = - unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr + unrolled_reduce(itrs, (promoted_empty(itrs),)) do product_itr, itr @inline unrolled_flatmap(itr) do item @inline diff --git a/src/generatively_unrolled_functions.jl b/src/generatively_unrolled_functions.jl index 45cb9fd..d68e2d9 100644 --- a/src/generatively_unrolled_functions.jl +++ b/src/generatively_unrolled_functions.jl @@ -1,93 +1,69 @@ -@generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - Expr(:||, (:(f(generic_getindex(itr, $n))) for n in 1:N)...), -) +@generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = quote + @inline + return Base.Cartesian.@nany $N n -> f(generic_getindex(itr, n)) +end @inline gen_unrolled_any(f, itr) = _gen_unrolled_any(Val(length(itr)), f, itr) -@generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - Expr(:&&, (:(f(generic_getindex(itr, $n))) for n in 1:N)...), -) +@generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = quote + @inline + return Base.Cartesian.@nall $N n -> f(generic_getindex(itr, n)) +end @inline gen_unrolled_all(f, itr) = _gen_unrolled_all(Val(length(itr)), f, itr) -@generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - (:(f(generic_getindex(itr, $n))) for n in 1:N)..., - nothing, -) +@generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = quote + @inline + Base.Cartesian.@nexprs $N n -> f(generic_getindex(itr, n)) + return nothing +end @inline gen_unrolled_foreach(f, itr) = _gen_unrolled_foreach(Val(length(itr)), f, itr) -@generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - Expr(:tuple, (:(f(generic_getindex(itr, $n))) for n in 1:N)...), -) +@generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = quote + @inline + return Base.Cartesian.@ntuple $N n -> f(generic_getindex(itr, n)) +end @inline gen_unrolled_map(f, itr) = _gen_unrolled_map(Val(length(itr)), f, itr) -@generated _gen_unrolled_applyat(::Val{N}, f, n, itr) where {N} = Expr( - :block, - Expr(:meta, :inline), - (:(n == $n && return f(generic_getindex(itr, $n))) for n in 1:N)..., - :(unrolled_applyat_bounds_error()), -) # This block gets optimized into a switch instruction during LLVM codegen. +@generated _gen_unrolled_applyat(::Val{N}, f, n′, itr) where {N} = quote + @inline + Base.Cartesian.@nexprs $N n -> + (n′ == n && return f(generic_getindex(itr, n))) + unrolled_applyat_bounds_error() +end # This is optimized into a switch instruction during LLVM code generation. 
@inline gen_unrolled_applyat(f, n, itr) = _gen_unrolled_applyat(Val(length(itr)), f, n, itr) -@generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = Expr( - :block, - Expr(:meta, :inline), - foldl( - (op_expr, n) -> :(op($op_expr, generic_getindex(itr, $n))), - (init <: NoInit ? 2 : 1):N; - init = init <: NoInit ? :(generic_getindex(itr, 1)) : :init, - ), # Use foldl instead of reduce to guarantee left associativity. -) +@generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = quote + @inline + value_0 = init + $N == 0 && return value_0 + return Base.Cartesian.@nexprs $N n -> + (value_n = op(value_{n - 1}, generic_getindex(itr, n))) +end +@generated _gen_unrolled_reduce(::Val{N}, op, itr, ::NoInit) where {N} = quote + @inline + value_1 = generic_getindex(itr, 1) + $N == 1 && return value_1 + return Base.Cartesian.@nexprs $(N - 1) n -> + (value_{n + 1} = op(value_n, generic_getindex(itr, n + 1))) +end @inline gen_unrolled_reduce(op, itr, init) = _gen_unrolled_reduce(Val(length(itr)), op, itr, init) -@generated function _gen_unrolled_accumulate( +@generated _gen_unrolled_accumulate( ::Val{N}, op, itr, init, transform, -) where {N} - first_item_expr = :(generic_getindex(itr, 1)) - init_expr = init <: NoInit ? first_item_expr : :(op(init, $first_item_expr)) - transformed_exprs_and_op_exprs = - accumulate(1:N; init = (nothing, init_expr)) do (_, op_expr), n - var = gensym() - next_op_expr = :(op($var, generic_getindex(itr, $(n + 1)))) - (:($var = $op_expr; transform($var)), next_op_expr) - end - return Expr( - :block, - Expr(:meta, :inline), - Expr(:tuple, Iterators.map(first, transformed_exprs_and_op_exprs)...), - ) +) where {N} = quote + @inline + $N == 0 && return () + first_itr_item = generic_getindex(itr, 1) + value_1 = init isa NoInit ? first_itr_item : op(init, first_itr_item) + Base.Cartesian.@nexprs $(N - 1) n -> + (value_{n + 1} = op(value_n, generic_getindex(itr, n + 1))) + return Base.Cartesian.@ntuple $N n -> transform(value_n) end @inline gen_unrolled_accumulate(op, itr, init, transform) = _gen_unrolled_accumulate(Val(length(itr)), op, itr, init, transform) - -# TODO: The following is experimental and will likely be removed in the future. -# For some reason, combining these two methods into one (or combining them with -# the method for gen_unrolled_reduce defined above) causes compilation of the -# non-orographic gravity wave parametrization test in ClimaAtmos to hang. -# Wrapping the first method's result in a block and adding an inline annotation -# also causes compilation to hang. Even using the assignment form of the first -# method definition below (as opposed to the function syntax used here) causes -# it to hang. This has not yet been replicated in a minimal working example. 
-@generated function val_unrolled_reduce(op, ::Val{N}, init) where {N} - return foldl((:init, 1:N...)) do prev_op_expr, item_expr - :(op($prev_op_expr, $item_expr)) - end -end -@generated val_unrolled_reduce(op, ::Val{N}, ::NoInit) where {N} = Expr( - :block, - Expr(:meta, :inline), - foldl((op_expr, item_expr) -> :(op($op_expr, $item_expr)), 1:N), -) diff --git a/src/recursion_limits.jl b/src/recursion_limits.jl index 9f9c279..2098fe9 100644 --- a/src/recursion_limits.jl +++ b/src/recursion_limits.jl @@ -32,7 +32,6 @@ gen_unrolled_applyat, gen_unrolled_reduce, gen_unrolled_accumulate, - val_unrolled_reduce, unrolled_any, unrolled_all, unrolled_foreach, diff --git a/src/unrollable_iterator_interface.jl b/src/unrollable_iterator_interface.jl index f17705f..dde3470 100644 --- a/src/unrollable_iterator_interface.jl +++ b/src/unrollable_iterator_interface.jl @@ -2,17 +2,10 @@ rec_unroll(itr) Whether to use recursive loop unrolling instead of generative loop unrolling for -the iterator `itr`. - -In general, recursive loop unrolling is faster to compile for small iterators, -but it becomes extremely slow to compile for long iterators, and it usually -generates suboptimal LLVM code for long iterators. On the other hand, generative -loop unrolling is slow to compile for small iterators, but its compilation time -does not grow as rapidly with respect to iterator size, and it always generates -optimal LLVM code. The default is currently to use recursive unrolling for -iterator lengths up to 16, and to use generative unrolling for longer iterators. +the iterator `itr`. Recursive unrolling can lead to suboptimal LLVM code for +iterators of more than 32 items, so this is set to `false` by default. """ -@inline rec_unroll(itr) = length(itr) <= 16 +@inline rec_unroll(itr) = false """ generic_getindex(itr, n) diff --git a/test/test_and_analyze.jl b/test/test_and_analyze.jl index 400746e..e269ec0 100644 --- a/test/test_and_analyze.jl +++ b/test/test_and_analyze.jl @@ -877,7 +877,7 @@ title = "Very Long Iterators" comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) @testset "unrolled functions of Tuples vs. StaticOneTos" begin - for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(8185)) + for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(9000)) @test_unrolled (itr,) unrolled_reduce(+, itr) reduce(+, itr) "Ints" @test_unrolled( (itr,), @@ -885,24 +885,7 @@ comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) mapreduce(log, +, itr), "Ints", ) - end # These can each take 40 seconds to compile for ntuple(identity, 8185). - for itr in (ntuple(identity, 8186), StaticOneTo(8186)) - @test_throws "gc handles" unrolled_reduce(+, itr) - @test_throws "gc handles" unrolled_mapreduce(log, +, itr) - end - # TODO: Why does the compiler throw an error when generating functions that - # get unrolled into more than 8185 lines of LLVM code? - - for itr in (StaticOneTo(8185), StaticOneTo(8186)) - @test_unrolled( - (itr,), - unrolled_reduce(+, Val(length(itr))), - reduce(+, itr), - "Ints", - ) - end - @test_throws "gc handles" unrolled_reduce(+, Val(8188)) - # TODO: Why is the limit 8186 for the Val version of unrolled_reduce? + end # These can take over a minute to compile for ntuple(identity, 9000). end title = "Generative vs. Recursive Unrolling"
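
Note (not part of the patch): the rewrite above leans on the `Base.Cartesian` macros `@ntuple`, `@nexprs`, `@nany`, and `@nall` instead of hand-built `Expr` trees. Below is a minimal sketch of what these macros expand to; the names `f`, `itr`, and `value_n` are illustrative placeholders, not identifiers from this package.

```julia
using Base.Cartesian: @ntuple, @nexprs, @nany, @nall

f(x) = x + 1
itr = (10, 20, 30)

# @ntuple N n -> expr builds the tuple (expr at n = 1, ..., expr at n = N):
@ntuple 3 n -> f(itr[n])                  # (11, 21, 31)

# @nexprs N n -> expr splices in N copies of expr, rewriting the suffix `_n`
# in symbols into the literal index, so value_n becomes value_1, value_2, ...:
@nexprs 3 n -> (value_n = f(itr[n]))
(value_1, value_2, value_3)               # (11, 21, 31)

# @nany and @nall build short-circuiting || and && chains:
@nany 3 n -> itr[n] > 25                  # true
@nall 3 n -> itr[n] > 25                  # false
```

With these, each `@generated` function in the patch only interpolates the iterator length `N` into a single macro call inside a `quote` block, rather than assembling the unrolled expression list manually.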