Add support for non-Tuple iterators

CliMA · Aug 28, 2024 · ba93856 · ba93856
1 parent 91a30eb
commit ba93856
Show file tree

Hide file tree

Showing 10 changed files with 943 additions and 189 deletions.
diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"
 
 [compat]
 julia = "1.10"
+StaticArrays = "1"
+
+[weakdeps]
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[extensions]
+UnrolledUtilitiesStaticArraysExt = "StaticArrays"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,16 +1,26 @@
+```@meta
+CurrentModule = UnrolledUtilities
+```
+
 #  UnrolledUtilities.jl
 
-A collection of generated functions in which all loops are unrolled and inlined:
+## Unrolled Functions
+
+This package exports the following functions, in which all loops are unrolled
+and inlined:
 - `unrolled_any(f, itr)`: similar to `any`
 - `unrolled_all(f, itr)`: similar to `all`
 - `unrolled_foreach(f, itrs...)`: similar to `foreach`
 - `unrolled_map(f, itrs...)`: similar to `map`
 - `unrolled_reduce(op, itr; [init])`: similar to `reduce`
 - `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce`
-- `unrolled_zip(itrs...)`: similar to `zip`
-- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
-  handle multiple iterators
+- `unrolled_accumulate(op, itr; [init], [transform])`: similar to `accumulate`,
+  but with an optional `transform` function applied to every accumulated value
+- `unrolled_mapaccumulate(f, op, itrs...; [init], [transform])`: a combination
+  of `unrolled_map` and `unrolled_accumulate`, analogous to `unrolled_mapreduce`
 - `unrolled_in(item, itr)`: similar to `in`
+- `unrolled_push(itr, item)`: similar to `push!`, but non-mutating
+- `unrolled_append(itr1, itr2)`: similar to `append!`, but non-mutating
 - `unrolled_unique(itr)`: similar to `unique`
 - `unrolled_filter(f, itr)`: similar to `filter`
 - `unrolled_split(f, itr)`: similar to `(filter(f, itr), filter(!f, itr))`, but
@@ -42,33 +52,70 @@ iterators have singleton element types (and when the result of calling `f`
 and/or `op` on these elements is inferrable). However, they can also be much
 more expensive to compile than their counterparts from `Base` and
 `Base.Iterators`, in which case they should not be used unless there is a clear
-performance benefit. Some notable exceptions to this are `unrolled_zip`,
-`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than
-`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation.
+performance benefit. Two notable exceptions to this are `unrolled_take` and
+`unrolled_drop`, which are faster to compile than their non-static versions.
+
+## Interface
+
+These functions can be used to unroll loops over all iterators with statically
+inferrable lengths. Compatibility with any such iterator type can be added
+through the following interface:
+
+```@docs
+type_length
+extended_getindex
+output_type_for_promotion
+ConditionalOutputType
+NoOutputType
+output_promote_rule
+constructor_from_tuple
+```
+
+This interface is used to provide built-in compatibility with
+- statically sized iterators from `Base` (`Tuple` and `NamedTuple`)
+- lazy iterators from `Base` (`enumerate`, `zip`, `Iterators.map`, and other
+  generator expressions)
+- statically sized iterators from
+  [StaticArrays.jl](https://github.com/JuliaArrays/StaticArrays.jl) (`SVector`
+  and `MVector`)
+- custom lazy and low-storage iterators (`LazySequence`, `StaticOneTo`, and
+  `BitSequence`)
+
+```@docs
+StaticSequence
+LazySequence
+StaticOneTo
+BitSequence
+```
+
+## When to Unroll
 
 For a more precise indication of whether you should use `UnrolledUtilities`,
 please consult the autogenerated [Comparison Table](@ref). This table contains a
-comprehensive set of potential use cases, each with a measurement of performance
-optimization, the time required for compilation, and the memory usage during
-compilation. Most cases involve simple functions `f` and/or `op`, but the last
-few demonstrate the benefits of unrolling with non-trivial recursive functions.
+comprehensive set of potential use cases, along with a few measurements that
+summarize their performance, compilation, and allocations:
+- overall level of optimization (type stability, constant propagation, etc.)
+- run time (best of several trial measurements)
+- compilation time (as reported by the compiler)
+- memory usage during compilation and first run (as reported by the garbage
+  collector and, when possible, the Julia process's resident set size estimator)
 
 The rows of the table are highlighted as follows:
-- green indicates an improvement in performance and either no change in
-  compilation or easier compilation (i.e., either similar or smaller values of
-  compilation time and memory usage)
-- dark blue indicates an improvement in performance and harder compilation
-  (i.e., larger values of compilation time and/or memory usage)
-- light blue indicates no change in performance and easier compilation
-- yellow indicates no change in performance and no change in compilation
-- magenta indicates no change in performance, an increase in compilation time,
-  and a decrease in compilation memory usage
-- red indicates no change in performance and harder compilation
+- green indicates an improvement in performance and either an improvement or
+  no change in compilation and allocations
+- dark blue indicates an improvement in performance and either slower
+  compilation or more allocations
+- light blue indicates no change in performance and either faster compilation or
+  fewer allocations
+- magenta indicates no change in performance and either faster compilation with
+  more allocations or slower compilation with fewer allocations
+- yellow indicates no change in performance, compilation, or allocations
+- red indicates a deterioration in performance, or no change in
+  performance and either slower compilation or more allocations
 
 Rows highlighted in green and blue present a clear advantage for unrolling,
-whereas those highlighted in yellow, magenta, and red either have no clear
-advantage, or they have a clear disadvantage. It is recommended that you only
-unroll when your use case is similar to a row in the first category.
+whereas those highlighted in magenta, yellow, and red either have no clear
+advantage or have a clear disadvantage. It is recommended that you only unroll
+when your use case is similar to a row in the first category.
 
 The table is also printed out by this package's unit tests, so these
 measurements can be compared across different operating systems by checking the

diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,14 @@
+module UnrolledUtilitiesStaticArraysExt
+
+import UnrolledUtilities
+import StaticArrays: SVector, MVector
+
+# Interface methods for `SVector`: the length is read from the first type
+# parameter, and `SVector` is returned both as the output type for promotion and
+# as the result of `constructor_from_tuple`.
+function UnrolledUtilities.type_length(::Type{<:SVector{N}}) where {N}
+    return N
+end
+function UnrolledUtilities.output_type_for_promotion(::SVector)
+    return SVector
+end
+function UnrolledUtilities.constructor_from_tuple(::Type{SVector})
+    return SVector
+end
+
+# Interface methods for `MVector`, which mirror the ones defined for `SVector`.
+function UnrolledUtilities.type_length(::Type{<:MVector{N}}) where {N}
+    return N
+end
+function UnrolledUtilities.output_type_for_promotion(::MVector)
+    return MVector
+end
+function UnrolledUtilities.constructor_from_tuple(::Type{MVector})
+    return MVector
+end
+
+end
diff --git a/src/BitSequence.jl b/src/BitSequence.jl
@@ -0,0 +1,144 @@
+"""
+    BitSequence{N, [U]}(f)
+    BitSequence{N, [U]}([bit])
+
+A statically sized counterpart to `BitVector` that packs `N` bits into a tuple
+of `Unsigned` chunks of type `U`. It can be built either from a function `f(n)`
+that returns the `n`-th bit, or from a constant `bit` that is used for every
+entry. When they are not specified, `U` is `UInt8` and `bit` is `false`.
+
+Since this iterator can only hold `Bool`s, its `output_type_for_promotion` is a
+`ConditionalOutputType`. Efficient methods are available for `unrolled_map`,
+`unrolled_accumulate`, `unrolled_take`, and `unrolled_drop`; the methods for
+`unrolled_map` and `unrolled_accumulate` are only used when the first items in
+their outputs are `Bool`s. Every other unrolled function that needs to generate
+an iterator uses an output type that is not a `BitSequence`.
+"""
+struct BitSequence{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: StaticSequence{N}
+    # Chunks of type `U` that hold the bits of the sequence.
+    ints::I
+end
+
+# Infer the tuple type parameter from the tuple of chunks.
+function BitSequence{N, U}(ints) where {N, U}
+    return BitSequence{N, U, typeof(ints)}(ints)
+end
+
+# Fall back to `UInt8` chunks when no chunk type is specified.
+function BitSequence{N}(args...) where {N}
+    return BitSequence{N, UInt8}(args...)
+end
+
+# Construct a `BitSequence` in which every chunk is filled with the same `bit`.
+function BitSequence{N, U}(bit::Bool = false) where {N, U}
+    # A chunk with all of its bits set to `bit`.
+    chunk = bit ? ~zero(U) : zero(U)
+    # Number of chunks of type `U` needed to hold `N` bits.
+    n_chunks = cld(N, 8 * sizeof(U))
+    return BitSequence{N, U}(ntuple(_ -> chunk, Val(n_chunks)))
+end
+
+# Construct a `BitSequence` whose `n`-th bit is `f(n)`, which must be a `Bool`.
+function BitSequence{N, U}(f::Function) where {N, U}
+    chunk_size = 8 * sizeof(U)
+    n_chunks = cld(N, chunk_size)
+    chunks = ntuple(Val(n_chunks)) do chunk_index
+        @inline
+        # Index of the bit that is stored at offset 0 of this chunk.
+        offset_0_index = chunk_size * (chunk_index - 1) + 1
+        # The last chunk holds fewer than `chunk_size` bits when `N` is not a
+        # multiple of `chunk_size`.
+        n_bits_in_chunk = min(chunk_size, N - offset_0_index + 1)
+        unrolled_reduce(
+            LazySequence{n_bits_in_chunk}(0);
+            init = zero(U),
+        ) do chunk, bit_offset
+            chunk | (U(f(offset_0_index + bit_offset)::Bool) << bit_offset)
+        end
+    end
+    return BitSequence{N, U}(chunks)
+end
+
+# Convert the 1-based bit index `n` into the 1-based index of the chunk of type
+# `U` that contains the bit and the 0-based offset of the bit within that chunk.
+@inline function int_index_and_bit_offset(::Type{U}, n) where {U}
+    n_bits_per_int = 8 * sizeof(U)
+    zero_based_index = n - 1
+    int_index = div(zero_based_index, n_bits_per_int) + 1
+    bit_offset = rem(zero_based_index, n_bits_per_int)
+    return (int_index, bit_offset)
+end
+
+# Read the bit at index `n` by shifting it down to the lowest position of its
+# chunk and masking out all of the other bits.
+@inline function Base.getindex(itr::BitSequence{<:Any, U}, n::Integer) where {U}
+    (int_index, bit_offset) = int_index_and_bit_offset(U, n)
+    shifted_int = itr.ints[int_index] >> bit_offset
+    return Bool(shifted_int & one(U))
+end
+
+# Return a new `BitSequence` that matches `itr`, except that the bit at index
+# `n` is set to `bit`. The input `itr` is not modified.
+@inline function Base.setindex(
+    itr::BitSequence{N, U},
+    bit::Bool,
+    n::Integer,
+) where {N, U}
+    (int_index, bit_offset) = int_index_and_bit_offset(U, n)
+    # Clear the target bit of the chunk before writing the new value into it.
+    cleared_int = itr.ints[int_index] & ~(one(U) << bit_offset)
+    new_int = cleared_int | (U(bit) << bit_offset)
+    new_ints = Base.setindex(itr.ints, new_int, int_index)
+    return BitSequence{N, U}(new_ints)
+end
+
+# A `BitSequence` can only store `Bool`s, so its output type is conditional on
+# `Bool`; the chunk type `U` of the input is kept for the output.
+function output_type_for_promotion(::BitSequence{<:Any, U}) where {U}
+    return ConditionalOutputType(Bool, BitSequence{<:Any, U})
+end
+
+# Map `f` over `itrs` and store the results in a `BitSequence` with chunks of
+# type `U`, reading each result from a lazy map instead of materializing them.
+@inline function unrolled_map_into(
+    ::Type{BitSequence{<:Any, U}},
+    f,
+    itrs...,
+) where {U}
+    mapped_items = Iterators.map(f, itrs...)
+    mapped_item_at = Base.Fix1(extended_getindex, mapped_items)
+    return BitSequence{inferred_length(mapped_items), U}(mapped_item_at)
+end
+
+# Accumulate the items of `itr` with `op`, apply `transform` to every
+# accumulated value, and store the results (which must be `Bool`s) in a
+# `BitSequence` with chunks of type `U`. An error is thrown when `itr` is empty
+# and `init` is a `NoInit`.
+@inline function unrolled_accumulate_into(
+    ::Type{BitSequence{<:Any, U}},
+    op,
+    itr,
+    init,
+    transform,
+) where {U}
+    N = inferred_length(itr)
+    (N == 0 && init isa NoInit) &&
+        error("unrolled_accumulate requires an init value for empty iterators")
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    # The outer accumulation builds one chunk per step. Its state is a tuple of
+    # the chunk and the most recent accumulated value, so that the value can be
+    # carried into the next chunk; `transform = first` keeps only the chunk.
+    ints = unrolled_accumulate_into_tuple(
+        LazySequence{n_ints}();
+        init = (nothing, init),
+        transform = first,
+    ) do (_, init_value_for_new_int), int_index
+        @inline
+        # Index of the item whose bit is stored at offset 0 of this chunk.
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        # The inner reduction fills in the bits of this chunk one at a time,
+        # using at most `n_bits_per_int` offsets that start from 0.
+        unrolled_reduce(
+            LazySequence{min(n_bits_per_int, N - first_index + 1)}(0);
+            init = (zero(U), init_value_for_new_int),
+        ) do (int, prev_value), bit_offset
+            item = extended_getindex(itr, first_index + bit_offset)
+            # Without an init value, the first item is used as-is instead of
+            # being combined with a previous value.
+            new_value =
+                first_index + bit_offset == 1 && prev_value isa NoInit ?
+                item : op(prev_value, item)
+            (int | U(transform(new_value)::Bool) << bit_offset, new_value)
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+# Take the first `N` bits of a `BitSequence` by keeping only the leading chunks
+# that are needed to hold them.
+@inline function unrolled_take(
+    itr::BitSequence{<:Any, U},
+    ::Val{N},
+) where {N, U}
+    n_kept_ints = cld(N, 8 * sizeof(U))
+    return BitSequence{N, U}(unrolled_take(itr.ints, Val(n_kept_ints)))
+end
+
+# Drop the first `N` bits of a `BitSequence` with `N_old` bits, so that the bit
+# at index `N + 1` of `itr` becomes the bit at index 1 of the result. The result
+# has `N_old - N` bits and the same chunk type `U` as `itr`.
+@inline function unrolled_drop(
+    itr::BitSequence{N_old, U},
+    ::Val{N},
+) where {N_old, N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N_old - N, n_bits_per_int)
+    # Only drop the chunks that lie entirely within the first `N` bits. Using
+    # `length(itr.ints) - n_ints` here instead can drop a chunk that still holds
+    # bits after index `N` and make `bit_offset` negative (e.g., `N_old = 10`
+    # and `N = 3` with `UInt8` chunks), which silently discards those bits.
+    n_dropped_ints, bit_offset = divrem(N, n_bits_per_int)
+    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
+    ints = if bit_offset == 0
+        ints_without_offset
+    else
+        cur_ints = ints_without_offset
+        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
+        # Shift every chunk down by `bit_offset` and fill its highest bits with
+        # the lowest bits of the next chunk; the last chunk has no next chunk.
+        shifted_ints =
+            unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
+                isnothing(next_int) ? cur_int >> bit_offset :
+                cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset)
+            end
+        # After the shift, the last chunk may no longer hold any of the
+        # remaining `N_old - N` bits, in which case it is discarded.
+        unrolled_take(shifted_ints, Val(n_ints))
+    end
+    return BitSequence{N_old - N, U}(ints)
+end
diff --git a/src/LazySequence.jl b/src/LazySequence.jl
@@ -0,0 +1,50 @@
+"""
+    LazySequence{N}(f)
+    LazySequence{N}([start])
+
+An iterator of length `N` whose `n`-th item is computed on demand as `f(n)`,
+which makes it a lazy version of `ntuple(f, Val(N))`. When it is constructed
+from a number `start` instead of a function, it is a lazy and statically sized
+version of `start:(start - 1 + N)`. If no argument is given, `start` is 1.
+
+`unrolled_take` and `unrolled_drop` have efficient methods for this iterator.
+Every other unrolled function that needs to generate an iterator uses an output
+type that is not a `LazySequence`.
+"""
+struct LazySequence{N, F} <: StaticSequence{N}
+    # Function that maps an index to the item at that index.
+    f::F
+end
+
+function LazySequence{N}(f::Function = identity) where {N}
+    return LazySequence{N, typeof(f)}(f)
+end
+
+function LazySequence{N}(start::Number) where {N}
+    # Adding `start - 1` to every index makes the item at index 1 equal `start`.
+    index_offset = start - one(start)
+    return LazySequence{N}(Base.Fix1(+, index_offset))
+end
+
+# The item at index `n` is obtained by calling the stored function on `n`.
+@inline function Base.getindex(itr::LazySequence, n::Integer)
+    return itr.f(n)
+end
+
+# A `LazySequence` reports `NoOutputType()` as its output type for promotion.
+function output_type_for_promotion(::LazySequence)
+    return NoOutputType()
+end
+
+# Promoting `LazySequence` with any other output type `O` results in `O`.
+function output_promote_rule(::Type{LazySequence}, ::Type{O}) where {O}
+    return O
+end
+
+# Taking the first `N` items only changes the length; the function is reused.
+@inline function unrolled_take(itr::LazySequence, ::Val{N}) where {N}
+    return LazySequence{N}(itr.f)
+end
+
+# Dropping the first `N` items shifts every index passed to the function by `N`.
+@inline function unrolled_drop(
+    itr::LazySequence{N_old},
+    ::Val{N},
+) where {N_old, N}
+    shifted_f = n -> itr.f(n + N)
+    return LazySequence{N_old - N}(shifted_f)
+end
+
+"""
+    StaticOneTo(N)
+
+A lazy counterpart to `Base.OneTo(N)` whose length `N` is stored as a type
+parameter.
+
+`unrolled_take` has an efficient method for this iterator. Every other unrolled
+function that needs to generate an iterator uses an output type that is not a
+`StaticOneTo`.
+"""
+struct StaticOneTo{N} <: StaticSequence{N} end
+
+# Move the length `N` from the value domain into the type domain.
+function StaticOneTo(N)
+    return StaticOneTo{N}()
+end
+
+# The item at index `n` of a `StaticOneTo` is `n` itself.
+@inline function Base.getindex(::StaticOneTo, n::Integer)
+    return n
+end
+
+# A `StaticOneTo` reports `NoOutputType()` as its output type for promotion.
+function output_type_for_promotion(::StaticOneTo)
+    return NoOutputType()
+end
+
+# Promoting `StaticOneTo` with any other output type `O` results in `O`.
+function output_promote_rule(::Type{StaticOneTo}, ::Type{O}) where {O}
+    return O
+end
+
+# Taking the first `N` items gives a `StaticOneTo` of length `N`.
+@inline function unrolled_take(::StaticOneTo, ::Val{N}) where {N}
+    return StaticOneTo{N}()
+end