Add support for non-Tuple iterators
dennisYatunin committed Sep 3, 2024
1 parent 91a30eb commit 3a78132
Showing 14 changed files with 1,485 additions and 399 deletions.
7 changes: 7 additions & 0 deletions Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"

[compat]
julia = "1.10"
StaticArrays = "1"

[weakdeps]
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[extensions]
UnrolledUtilitiesStaticArraysExt = "StaticArrays"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
18 changes: 10 additions & 8 deletions docs/make.jl
@@ -2,23 +2,25 @@ using Documenter

include(joinpath("..", "test", "test_and_analyze.jl"))

-comparison_table_file = joinpath("docs", "src", "comparison_table.md")
+comparison_table_file = joinpath("docs", "src", "comparison_tables.md")

open(comparison_table_file, "w") do io
-    println(io, "# Comparison Table\n```@raw html")
-    println(io, "<div style=\"width: max(80vw, 100%)\">") # use 80% of viewport
-    print_comparison_table(io, true)
-    println(io, "</div>")
-    println(io, "```")
+    println(io, "# Comparison Tables")
+    for (title, comparison_table_dict) in comparison_table_dicts
+        print_comparison_table(title, comparison_table_dict, io)
+    end
end

makedocs(;
    sitename = "UnrolledUtilities.jl",
    modules = [UnrolledUtilities],
-    pages = ["Home" => "index.md", "Comparison Table" => "comparison_table.md"],
+    pages = [
+        "Home" => "index.md",
+        "Comparison Tables" => "comparison_tables.md",
+    ],
    format = Documenter.HTML(
        prettyurls = get(ENV, "CI", nothing) == "true",
-        size_threshold_ignore = ["comparison_table.md"],
+        size_threshold_ignore = ["comparison_tables.md"],
    ),
    clean = true,
)
117 changes: 84 additions & 33 deletions docs/src/index.md
@@ -1,15 +1,28 @@
```@meta
CurrentModule = UnrolledUtilities
```

# UnrolledUtilities.jl

-A collection of generated functions in which all loops are unrolled and inlined:
+## Unrolled Functions
+
+This package exports the following functions, in which all loops are unrolled
+and inlined (a brief usage sketch follows the list):
- `unrolled_any(f, itr)`: similar to `any`
- `unrolled_all(f, itr)`: similar to `all`
- `unrolled_foreach(f, itrs...)`: similar to `foreach`
- `unrolled_map(f, itrs...)`: similar to `map`
- `unrolled_applyat(f, n, itrs...)`: similar to `f(map(itr -> itr[n], itrs)...)`
- `unrolled_reduce(op, itr; [init])`: similar to `reduce`
- `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce`
- `unrolled_zip(itrs...)`: similar to `zip`
- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
handle multiple iterators
- `unrolled_accumulate(op, itr; [init], [transform])`: similar to `accumulate`,
but with an optional `transform` function applied to every accumulated value
- `unrolled_push(itr, item)`: similar to `push!`, but non-mutating
- `unrolled_append(itr1, itr2)`: similar to `append!`, but non-mutating
- `unrolled_take(itr, ::Val{N})`: similar to `Iterators.take` (and to
`itr[1:N]`), but with `N` wrapped in a `Val`
- `unrolled_drop(itr, ::Val{N})`: similar to `Iterators.drop` (and to
`itr[(N + 1):end]`), but with `N` wrapped in a `Val`
- `unrolled_in(item, itr)`: similar to `in`
- `unrolled_unique(itr)`: similar to `unique`
- `unrolled_filter(f, itr)`: similar to `filter`
@@ -18,11 +31,6 @@ A collection of generated functions in which all loops are unrolled and inlined:
- `unrolled_flatten(itr)`: similar to `Iterators.flatten`
- `unrolled_flatmap(f, itrs...)`: similar to `Iterators.flatmap`
- `unrolled_product(itrs...)`: similar to `Iterators.product`
-- `unrolled_applyat(f, n, itrs...)`: similar to `f(map(itr -> itr[n], itrs)...)`
-- `unrolled_take(itr, ::Val{N})`: similar to `itr[1:N]` (and to
-  `Iterators.take`), but with `N` wrapped in a `Val`
-- `unrolled_drop(itr, ::Val{N})`: similar to `itr[(N + 1):end]` (and to
-  `Iterators.drop`), but with `N` wrapped in a `Val`
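
For example, here is a minimal sketch of typical usage with `Tuple` arguments
(the results in the comments assume `Tuple` inputs, for which the outputs are
also `Tuple`s):

```julia
using UnrolledUtilities

# Each of these calls is fully unrolled and inlined at compile time.
unrolled_map(x -> x^2, (1, 2, 3))        # (1, 4, 9)
unrolled_any(iseven, (1, 2, 3))          # true
unrolled_reduce(+, (1, 2, 3); init = 0)  # 6
unrolled_filter(isodd, (1, 2, 3))        # (1, 3)
```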

These functions are guaranteed to be type-stable whenever they are given
iterators with inferrable lengths and element types, including when
@@ -42,34 +50,77 @@ iterators have singleton element types (and when the result of calling `f`
and/or `op` on these elements is inferrable). However, they can also be much
more expensive to compile than their counterparts from `Base` and
`Base.Iterators`, in which case they should not be used unless there is a clear
-performance benefit. Some notable exceptions to this are `unrolled_zip`,
-`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than
-`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation.
+performance benefit. Two notable exceptions to this are `unrolled_take` and
+`unrolled_drop`, which are faster to compile than their non-static versions.
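
For instance, here is a small sketch of how the `Val`-wrapped length is used
(again assuming `Tuple` inputs):

```julia
using UnrolledUtilities

unrolled_take((10, 20, 30, 40), Val(2))  # (10, 20), similar to itr[1:2]
unrolled_drop((10, 20, 30, 40), Val(1))  # (20, 30, 40), similar to itr[2:end]
```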

## Interface

These functions can be used to unroll loops over all iterators with statically
inferrable lengths. Compatibility with any such iterator type can be added
through the following interface:

```@docs
rec_unroll
generic_getindex
output_type_for_promotion
NoOutputType
ConditionalOutputType
output_promote_rule
constructor_from_tuple
```
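
For example, the following is a minimal sketch of how a hypothetical statically
sized wrapper type (not part of this package) might opt into this interface; the
exact set of methods a custom type needs can depend on which unrolled functions
are used with it:

```julia
using UnrolledUtilities

# A hypothetical fixed-length wrapper around an NTuple.
struct MyStaticVector{N, T}
    data::NTuple{N, T}
end
Base.length(::MyStaticVector{N}) where {N} = N

# Element access used by the unrolled functions.
UnrolledUtilities.generic_getindex(v::MyStaticVector, n) = v.data[n]

# Unrolled functions that construct new iterators from MyStaticVector inputs
# should also return MyStaticVectors, built from intermediate Tuples.
UnrolledUtilities.output_type_for_promotion(::MyStaticVector) = MyStaticVector
UnrolledUtilities.constructor_from_tuple(::Type{MyStaticVector}) = MyStaticVector
```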

This interface is used to provide built-in compatibility with
- statically sized iterators from `Base` (`Tuple` and `NamedTuple`)
- lazy iterators from `Base` (`enumerate`, `zip`, `Iterators.map`, and other
generator expressions)
- statically sized iterators from
[StaticArrays.jl](https://github.com/JuliaArrays/StaticArrays.jl) (`SVector`
and `MVector`)
- custom lazy and low-storage iterators (`StaticOneTo` and `StaticBitVector`)

```@docs
StaticOneTo
StaticBitVector
```

## When to Unroll

For a more precise indication of whether you should use `UnrolledUtilities`,
-please consult the autogenerated [Comparison Table](@ref). This table contains a
-comprehensive set of potential use cases, each with a measurement of performance
-optimization, the time required for compilation, and the memory usage during
-compilation. Most cases involve simple functions `f` and/or `op`, but the last
-few demonstrate the benefits of unrolling with non-trivial recursive functions.
+please consult the autogenerated [Comparison Tables](@ref). These tables contain
+a comprehensive set of potential use cases, along with a few measurements that
+summarize their performance, compilation, and allocations:
+- run time (best of several trial measurements)
+- compilation time (as reported by the compiler)
+- overall level of optimization (type stability, constant propagation, etc.) and
+  allocations during run time (as reported by the garbage collector)
+- total allocations during compilation and first run (as reported by the garbage
+  collector and, when possible, the Julia process's resident set size estimator)

-The rows of the table are highlighted as follows:
-- green indicates an improvement in performance and either no change in
-  compilation or easier compilation (i.e., either similar or smaller values of
-  compilation time and memory usage)
-- dark blue indicates an improvement in performance and harder compilation
-  (i.e., larger values of compilation time and/or memory usage)
-- light blue indicates no change in performance and easier compilation
-- yellow indicates no change in performance and no change in compilation
-- magenta indicates no change in performance, an increase in compilation time,
-  and a decrease in compilation memory usage
-- red indicates no change in performance and harder compilation
+The rows of the tables are highlighted as follows:
+- light blue indicates an improvement in performance due to better optimization
+  and either an improvement or no change in compilation time and total
+  allocations
+- green indicates either faster run time or fewer allocations during run time
+  and either an improvement or no change in compilation time and total
+  allocations
+- dark blue indicates an improvement in performance due to better optimization
+  and either slower compilation or more total allocations
+- yellow indicates either faster run time or fewer allocations during run time
+  and either slower compilation or more total allocations
+- magenta indicates no change in performance and either an improvement or no
+  change in compilation time and total allocations
+- light gray indicates no change in performance and no change in compilation
+  time and total allocations
+- dark gray indicates no change in performance and either faster compilation
+  with more total allocations or slower compilation with fewer total allocations
+- red indicates a deterioration in performance, or no change in
+  performance and either slower compilation or more total allocations

-Rows highlighted in green and blue present a clear advantage for unrolling,
-whereas those highlighted in yellow, magenta, and red either have no clear
-advantage, or they have a clear disadvantage. It is recommended that you only
-unroll when your use case is similar to a row in the first category.
+Rows highlighted in gray present no clear advantage to unrolling, while those
+highlighted in red present a clear disadvantage. It is recommended that you only
+unroll when your use case is similar to a row in one of the remaining
+categories, each of which demonstrates some advantage to unrolling.

-The table is also printed out by this package's unit tests, so these
+The tables are also printed out by this package's unit tests, so these
measurements can be compared across different operating systems by checking the
[CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml).
12 changes: 12 additions & 0 deletions ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,12 @@
module UnrolledUtilitiesStaticArraysExt

import UnrolledUtilities
import StaticArrays: SVector, MVector

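# With these definitions, unrolled functions that construct new iterators from
# SVector or MVector inputs will build their outputs as SVectors or MVectors
# (constructed from intermediate Tuples).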
@inline UnrolledUtilities.output_type_for_promotion(::SVector) = SVector
@inline UnrolledUtilities.constructor_from_tuple(::Type{SVector}) = SVector

@inline UnrolledUtilities.output_type_for_promotion(::MVector) = MVector
@inline UnrolledUtilities.constructor_from_tuple(::Type{MVector}) = MVector

end
155 changes: 155 additions & 0 deletions src/StaticBitVector.jl
@@ -0,0 +1,155 @@
"""
StaticBitVector{N, [U]}(f)
StaticBitVector{N, [U]}([bit])
A statically-sized analogue of `BitVector` with `Unsigned` chunks of type `U`,
which can be constructed using either a function `f(n)` or a constant `bit`. By
default, `U` is set to `UInt8` and `bit` is set to `false`.
This iterator can only store `Bool`s, so its `output_type_for_promotion` is a
`ConditionalOutputType`. Efficient methods are provided for `unrolled_map`,
`unrolled_accumulate`, `unrolled_take`, and `unrolled_drop`, though the methods
for `unrolled_map` and `unrolled_accumulate` only apply when their output's
first item is a `Bool`. No other unrolled functions can use `StaticBitVector`s
as output types.
"""
struct StaticBitVector{N, U <: Unsigned, I <: NTuple{<:Any, U}} <:
       StaticSequence{N}
    ints::I
end
@inline StaticBitVector{N, U}(ints) where {N, U} =
    StaticBitVector{N, U, typeof(ints)}(ints)
@inline StaticBitVector{N}(args...) where {N} =
    StaticBitVector{N, UInt8}(args...)

@inline function StaticBitVector{N, U}(bit::Bool = false) where {N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = ntuple(Returns(bit ? ~zero(U) : zero(U)), Val(n_ints))
    return StaticBitVector{N, U}(ints)
end

@inline function StaticBitVector{N, U}(f::Function) where {N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = ntuple(Val(n_ints)) do int_index
        @inline
        first_index = n_bits_per_int * (int_index - 1) + 1
        unrolled_reduce(
            StaticOneTo(min(n_bits_per_int, N - first_index + 1));
            init = zero(U),
        ) do int, bit_index
            @inline
            bit_offset = bit_index - 1
            int | U(f(first_index + bit_offset)::Bool) << bit_offset
        end
    end
    return StaticBitVector{N, U}(ints)
end
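
# For example, StaticBitVector{5}(isodd) packs the bits (1, 0, 1, 0, 1) into a
# single UInt8 chunk, and StaticBitVector{5}(true) sets every stored bit to true.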

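# Map a bit position n (1-based) to the index of the chunk that stores it and
# the bit's offset within that chunk (0-based).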
@inline function int_index_and_bit_offset(::Type{U}, n) where {U}
    int_offset, bit_offset = divrem(n - 1, 8 * sizeof(U))
    return (int_offset + 1, bit_offset)
end

@inline function generic_getindex(
    itr::StaticBitVector{<:Any, U},
    n::Integer,
) where {U}
    int_index, bit_offset = int_index_and_bit_offset(U, n)
    int = itr.ints[int_index]
    return Bool(int >> bit_offset & one(int))
end

@inline function Base.setindex(
    itr::StaticBitVector{N, U},
    bit::Bool,
    n::Integer,
) where {N, U}
    int_index, bit_offset = int_index_and_bit_offset(U, n)
    int = itr.ints[int_index]
    int′ = int & ~(one(int) << bit_offset) | U(bit) << bit_offset
    ints = Base.setindex(itr.ints, int′, int_index)
    return StaticBitVector{N, U}(ints)
end

@inline output_type_for_promotion(::StaticBitVector{<:Any, U}) where {U} =
    ConditionalOutputType(Bool, StaticBitVector{<:Any, U})

@inline function unrolled_map_into(
    ::Type{StaticBitVector{<:Any, U}},
    f,
    itrs...,
) where {U}
    lazy_itr = Iterators.map(f, itrs...)
    N = length(lazy_itr)
    return StaticBitVector{N, U}(Base.Fix1(generic_getindex, lazy_itr))
end

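# Accumulate chunk by chunk: the outer call threads the running accumulator
# value across chunks, while the inner unrolled_reduce packs each chunk's
# accumulated bits into a single unsigned integer.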
@inline function unrolled_accumulate_into(
    ::Type{StaticBitVector{<:Any, U}},
    op,
    itr,
    init,
    transform,
) where {U}
    N = length(itr)
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = unrolled_accumulate_into_tuple(
        StaticOneTo(n_ints);
        init = (nothing, init),
        transform = first,
    ) do (_, init_value_for_new_int), int_index
        @inline
        first_index = n_bits_per_int * (int_index - 1) + 1
        unrolled_reduce(
            StaticOneTo(min(n_bits_per_int, N - first_index + 1));
            init = (zero(U), init_value_for_new_int),
        ) do (int, prev_value), bit_index
            @inline
            bit_offset = bit_index - 1
            item = generic_getindex(itr, first_index + bit_offset)
            new_value =
                first_index + bit_offset == 1 && prev_value isa NoInit ?
                item : op(prev_value, item)
            (int | U(transform(new_value)::Bool) << bit_offset, new_value)
        end
    end
    return StaticBitVector{N, U}(ints)
end

# TODO: Add unrolled_push and unrolled_append

@inline function unrolled_take(
    itr::StaticBitVector{<:Any, U},
    ::Val{N},
) where {N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = unrolled_take(itr.ints, Val(n_ints))
    return StaticBitVector{N, U}(ints)
end

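# Dropping N bits amounts to dropping the chunks that contain only dropped bits
# and then, when the remaining offset is nonzero, shifting each remaining chunk
# down and filling its high bits from the following chunk.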
@inline function unrolled_drop(
    itr::StaticBitVector{N_old, U},
    ::Val{N},
) where {N_old, N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N_old - N, n_bits_per_int)
    n_dropped_ints = length(itr.ints) - n_ints
    bit_offset = N - n_bits_per_int * n_dropped_ints
    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
    ints = if bit_offset == 0
        ints_without_offset
    else
        cur_ints = ints_without_offset
        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
        unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
            @inline
            isnothing(next_int) ? cur_int >> bit_offset :
            cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset)
        end
    end
    return StaticBitVector{N_old - N, U}(ints)
end
18 changes: 18 additions & 0 deletions src/StaticOneTo.jl
@@ -0,0 +1,18 @@
"""
StaticOneTo(N)
A lazy and statically-sized analogue of `Base.OneTo(N)`.
This iterator can only store the integers from 1 to `N`, so its
`output_type_for_promotion` is `NoOutputType()`. An efficient method is provided
for `unrolled_take`, but no other unrolled functions can use `StaticOneTo`s as
output types.
"""
struct StaticOneTo{N} <: StaticSequence{N} end
@inline StaticOneTo(N) = StaticOneTo{N}()

@inline generic_getindex(::StaticOneTo, n) = n

@inline output_type_for_promotion(::StaticOneTo) = NoOutputType()

@inline unrolled_take(::StaticOneTo, ::Val{N}) where {N} = StaticOneTo(N)
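
# For example, StaticOneTo(4) iterates over 1, 2, 3, and 4 without storing them,
# and unrolled_take(StaticOneTo(9), Val(3)) is simply StaticOneTo(3).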