Add support for non-Tuple iterators
dennisYatunin committed Sep 3, 2024
1 parent 91a30eb commit 3a78132
Showing 14 changed files with 1,485 additions and 399 deletions.
7 changes: 7 additions & 0 deletions Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"

[compat]
julia = "1.10"
StaticArrays = "1"

[weakdeps]
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[extensions]
UnrolledUtilitiesStaticArraysExt = "StaticArrays"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
18 changes: 10 additions & 8 deletions docs/make.jl
@@ -2,23 +2,25 @@ using Documenter

include(joinpath("..", "test", "test_and_analyze.jl"))

-comparison_table_file = joinpath("docs", "src", "comparison_table.md")
+comparison_table_file = joinpath("docs", "src", "comparison_tables.md")

open(comparison_table_file, "w") do io
-    println(io, "# Comparison Table\n```@raw html")
-    println(io, "<div style=\"width: max(80vw, 100%)\">") # use 80% of viewport
-    print_comparison_table(io, true)
-    println(io, "</div>")
-    println(io, "```")
+    println(io, "# Comparison Tables")
+    for (title, comparison_table_dict) in comparison_table_dicts
+        print_comparison_table(title, comparison_table_dict, io)
+    end
end

makedocs(;
    sitename = "UnrolledUtilities.jl",
    modules = [UnrolledUtilities],
-    pages = ["Home" => "index.md", "Comparison Table" => "comparison_table.md"],
+    pages = [
+        "Home" => "index.md",
+        "Comparison Tables" => "comparison_tables.md",
+    ],
    format = Documenter.HTML(
        prettyurls = get(ENV, "CI", nothing) == "true",
-        size_threshold_ignore = ["comparison_table.md"],
+        size_threshold_ignore = ["comparison_tables.md"],
    ),
    clean = true,
)
117 changes: 84 additions & 33 deletions docs/src/index.md
@@ -1,15 +1,28 @@
```@meta
CurrentModule = UnrolledUtilities
```

# UnrolledUtilities.jl

-A collection of generated functions in which all loops are unrolled and inlined:
+## Unrolled Functions
+
+This package exports the following functions, in which all loops are unrolled
+and inlined (a brief usage sketch follows the list):
- `unrolled_any(f, itr)`: similar to `any`
- `unrolled_all(f, itr)`: similar to `all`
- `unrolled_foreach(f, itrs...)`: similar to `foreach`
- `unrolled_map(f, itrs...)`: similar to `map`
- `unrolled_applyat(f, n, itrs...)`: similar to `f(map(itr -> itr[n], itrs)...)`
- `unrolled_reduce(op, itr; [init])`: similar to `reduce`
- `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce`
- `unrolled_zip(itrs...)`: similar to `zip`
- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
handle multiple iterators
- `unrolled_accumulate(op, itr; [init], [transform])`: similar to `accumulate`,
but with an optional `transform` function applied to every accumulated value
- `unrolled_push(itr, item)`: similar to `push!`, but non-mutating
- `unrolled_append(itr1, itr2)`: similar to `append!`, but non-mutating
- `unrolled_take(itr, ::Val{N})`: similar to `Iterators.take` (and to
`itr[1:N]`), but with `N` wrapped in a `Val`
- `unrolled_drop(itr, ::Val{N})`: similar to `Iterators.drop` (and to
`itr[(N + 1):end]`), but with `N` wrapped in a `Val`
- `unrolled_in(item, itr)`: similar to `in`
- `unrolled_unique(itr)`: similar to `unique`
- `unrolled_filter(f, itr)`: similar to `filter`
@@ -18,11 +31,6 @@ A collection of generated functions in which all loops are unrolled and inlined:
- `unrolled_flatten(itr)`: similar to `Iterators.flatten`
- `unrolled_flatmap(f, itrs...)`: similar to `Iterators.flatmap`
- `unrolled_product(itrs...)`: similar to `Iterators.product`
-- `unrolled_applyat(f, n, itrs...)`: similar to `f(map(itr -> itr[n], itrs)...)`
-- `unrolled_take(itr, ::Val{N})`: similar to `itr[1:N]` (and to
-  `Iterators.take`), but with `N` wrapped in a `Val`
-- `unrolled_drop(itr, ::Val{N})`: similar to `itr[(N + 1):end]` (and to
-  `Iterators.drop`), but with `N` wrapped in a `Val`
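
For example, here is a minimal sketch of typical usage with `Tuple` arguments
(the results in the comments assume `Tuple` inputs, for which the outputs are
also `Tuple`s):

```julia
using UnrolledUtilities

# Each of these calls is fully unrolled and inlined at compile time.
unrolled_map(x -> x^2, (1, 2, 3))        # (1, 4, 9)
unrolled_any(iseven, (1, 2, 3))          # true
unrolled_reduce(+, (1, 2, 3); init = 0)  # 6
unrolled_filter(isodd, (1, 2, 3))        # (1, 3)
```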

These functions are guaranteed to be type-stable whenever they are given
iterators with inferrable lengths and element types, including when
@@ -42,34 +50,77 @@ iterators have singleton element types (and when the result of calling `f`
and/or `op` on these elements is inferrable). However, they can also be much
more expensive to compile than their counterparts from `Base` and
`Base.Iterators`, in which case they should not be used unless there is a clear
-performance benefit. Some notable exceptions to this are `unrolled_zip`,
-`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than
-`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation.
+performance benefit. Two notable exceptions to this are `unrolled_take` and
+`unrolled_drop`, which are faster to compile than their non-static versions.
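
For instance, here is a small sketch of how the `Val`-wrapped length is used
(again assuming `Tuple` inputs):

```julia
using UnrolledUtilities

unrolled_take((10, 20, 30, 40), Val(2))  # (10, 20), similar to itr[1:2]
unrolled_drop((10, 20, 30, 40), Val(1))  # (20, 30, 40), similar to itr[2:end]
```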

## Interface

These functions can be used to unroll loops over all iterators with statically
inferrable lengths. Compatibility with any such iterator type can be added
through the following interface:

```@docs
rec_unroll
generic_getindex
output_type_for_promotion
NoOutputType
ConditionalOutputType
output_promote_rule
constructor_from_tuple
```
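
For example, the following is a minimal sketch of how a hypothetical statically
sized wrapper type (not part of this package) might opt into this interface; the
exact set of methods a custom type needs can depend on which unrolled functions
are used with it:

```julia
using UnrolledUtilities

# A hypothetical fixed-length wrapper around an NTuple.
struct MyStaticVector{N, T}
    data::NTuple{N, T}
end
Base.length(::MyStaticVector{N}) where {N} = N

# Element access used by the unrolled functions.
UnrolledUtilities.generic_getindex(v::MyStaticVector, n) = v.data[n]

# Unrolled functions that construct new iterators from MyStaticVector inputs
# should also return MyStaticVectors, built from intermediate Tuples.
UnrolledUtilities.output_type_for_promotion(::MyStaticVector) = MyStaticVector
UnrolledUtilities.constructor_from_tuple(::Type{MyStaticVector}) = MyStaticVector
```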

This interface is used to provide built-in compatibility with
- statically sized iterators from `Base` (`Tuple` and `NamedTuple`)
- lazy iterators from `Base` (`enumerate`, `zip`, `Iterators.map`, and other
generator expressions)
- statically sized iterators from
[StaticArrays.jl](https://github.com/JuliaArrays/StaticArrays.jl) (`SVector`
and `MVector`)
- custom lazy and low-storage iterators (`StaticOneTo` and `StaticBitVector`)

```@docs
StaticOneTo
StaticBitVector
```

## When to Unroll

For a more precise indication of whether you should use `UnrolledUtilities`,
-please consult the autogenerated [Comparison Table](@ref). This table contains a
-comprehensive set of potential use cases, each with a measurement of performance
-optimization, the time required for compilation, and the memory usage during
-compilation. Most cases involve simple functions `f` and/or `op`, but the last
-few demonstrate the benefits of unrolling with non-trivial recursive functions.
+please consult the autogenerated [Comparison Tables](@ref). These tables contain
+a comprehensive set of potential use cases, along with a few measurements that
+summarize their performance, compilation, and allocations:
+- run time (best of several trial measurements)
+- compilation time (as reported by the compiler)
+- overall level of optimization (type stability, constant propagation, etc.) and
+  allocations during run time (as reported by the garbage collector)
+- total allocations during compilation and first run (as reported by the garbage
+  collector and, when possible, the Julia process's resident set size estimator)

-The rows of the table are highlighted as follows:
-- green indicates an improvement in performance and either no change in
-  compilation or easier compilation (i.e., either similar or smaller values of
-  compilation time and memory usage)
-- dark blue indicates an improvement in performance and harder compilation
-  (i.e., larger values of compilation time and/or memory usage)
-- light blue indicates no change in performance and easier compilation
-- yellow indicates no change in performance and no change in compilation
-- magenta indicates no change in performance, an increase in compilation time,
-  and a decrease in compilation memory usage
-- red indicates no change in performance and harder compilation
+The rows of the tables are highlighted as follows:
+- light blue indicates an improvement in performance due to better optimization
+  and either an improvement or no change in compilation time and total
+  allocations
+- green indicates either faster run time or fewer allocations during run time
+  and either an improvement or no change in compilation time and total
+  allocations
+- dark blue indicates an improvement in performance due to better optimization
+  and either slower compilation or more total allocations
+- yellow indicates either faster run time or fewer allocations during run time
+  and either slower compilation or more total allocations
+- magenta indicates no change in performance and either an improvement or no
+  change in compilation time and total allocations
+- light gray indicates no change in performance and no change in compilation
+  time and total allocations
+- dark gray indicates no change in performance and either faster compilation
+  with more total allocations or slower compilation with fewer total allocations
+- red indicates a deterioration in performance, or no change in
+  performance and either slower compilation or more total allocations

-Rows highlighted in green and blue present a clear advantage for unrolling,
-whereas those highlighted in yellow, magenta, and red either have no clear
-advantage, or they have a clear disadvantage. It is recommended that you only
-unroll when your use case is similar to a row in the first category.
+Rows highlighted in gray present no clear advantage to unrolling, while those
+highlighted in red present a clear disadvantage. It is recommended that you only
+unroll when your use case is similar to a row in one of the remaining
+categories, each of which demonstrates some advantage to unrolling.

-The table is also printed out by this package's unit tests, so these
+The tables are also printed out by this package's unit tests, so these
measurements can be compared across different operating systems by checking the
[CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml).
12 changes: 12 additions & 0 deletions ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,12 @@
module UnrolledUtilitiesStaticArraysExt

import UnrolledUtilities
import StaticArrays: SVector, MVector

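# With these definitions, unrolled functions that construct new iterators from
# SVector or MVector inputs will build their outputs as SVectors or MVectors
# (constructed from intermediate Tuples).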
@inline UnrolledUtilities.output_type_for_promotion(::SVector) = SVector
@inline UnrolledUtilities.constructor_from_tuple(::Type{SVector}) = SVector

@inline UnrolledUtilities.output_type_for_promotion(::MVector) = MVector
@inline UnrolledUtilities.constructor_from_tuple(::Type{MVector}) = MVector

end
155 changes: 155 additions & 0 deletions src/StaticBitVector.jl
@@ -0,0 +1,155 @@
"""
StaticBitVector{N, [U]}(f)
StaticBitVector{N, [U]}([bit])
A statically-sized analogue of `BitVector` with `Unsigned` chunks of type `U`,
which can be constructed using either a function `f(n)` or a constant `bit`. By
default, `U` is set to `UInt8` and `bit` is set to `false`.
This iterator can only store `Bool`s, so its `output_type_for_promotion` is a
`ConditionalOutputType`. Efficient methods are provided for `unrolled_map`,
`unrolled_accumulate`, `unrolled_take`, and `unrolled_drop`, though the methods
for `unrolled_map` and `unrolled_accumulate` only apply when their output's
first item is a `Bool`. No other unrolled functions can use `StaticBitVector`s
as output types.
"""
struct StaticBitVector{N, U <: Unsigned, I <: NTuple{<:Any, U}} <:
       StaticSequence{N}
    ints::I
end
@inline StaticBitVector{N, U}(ints) where {N, U} =
    StaticBitVector{N, U, typeof(ints)}(ints)
@inline StaticBitVector{N}(args...) where {N} =
    StaticBitVector{N, UInt8}(args...)

@inline function StaticBitVector{N, U}(bit::Bool = false) where {N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = ntuple(Returns(bit ? ~zero(U) : zero(U)), Val(n_ints))
    return StaticBitVector{N, U}(ints)
end

@inline function StaticBitVector{N, U}(f::Function) where {N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = ntuple(Val(n_ints)) do int_index
        @inline
        first_index = n_bits_per_int * (int_index - 1) + 1
        unrolled_reduce(
            StaticOneTo(min(n_bits_per_int, N - first_index + 1));
            init = zero(U),
        ) do int, bit_index
            @inline
            bit_offset = bit_index - 1
            int | U(f(first_index + bit_offset)::Bool) << bit_offset
        end
    end
    return StaticBitVector{N, U}(ints)
end
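
# For example, StaticBitVector{5}(isodd) packs the bits (1, 0, 1, 0, 1) into a
# single UInt8 chunk, and StaticBitVector{5}(true) sets every stored bit to true.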

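# Map a bit position n (1-based) to the index of the chunk that stores it and
# the bit's offset within that chunk (0-based).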
@inline function int_index_and_bit_offset(::Type{U}, n) where {U}
    int_offset, bit_offset = divrem(n - 1, 8 * sizeof(U))
    return (int_offset + 1, bit_offset)
end

@inline function generic_getindex(
    itr::StaticBitVector{<:Any, U},
    n::Integer,
) where {U}
    int_index, bit_offset = int_index_and_bit_offset(U, n)
    int = itr.ints[int_index]
    return Bool(int >> bit_offset & one(int))
end

@inline function Base.setindex(
    itr::StaticBitVector{N, U},
    bit::Bool,
    n::Integer,
) where {N, U}
    int_index, bit_offset = int_index_and_bit_offset(U, n)
    int = itr.ints[int_index]
    int′ = int & ~(one(int) << bit_offset) | U(bit) << bit_offset
    ints = Base.setindex(itr.ints, int′, int_index)
    return StaticBitVector{N, U}(ints)
end

@inline output_type_for_promotion(::StaticBitVector{<:Any, U}) where {U} =
    ConditionalOutputType(Bool, StaticBitVector{<:Any, U})

@inline function unrolled_map_into(
    ::Type{StaticBitVector{<:Any, U}},
    f,
    itrs...,
) where {U}
    lazy_itr = Iterators.map(f, itrs...)
    N = length(lazy_itr)
    return StaticBitVector{N, U}(Base.Fix1(generic_getindex, lazy_itr))
end

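# Accumulate chunk by chunk: the outer call threads the running accumulator
# value across chunks, while the inner unrolled_reduce packs each chunk's
# accumulated bits into a single unsigned integer.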
@inline function unrolled_accumulate_into(
    ::Type{StaticBitVector{<:Any, U}},
    op,
    itr,
    init,
    transform,
) where {U}
    N = length(itr)
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = unrolled_accumulate_into_tuple(
        StaticOneTo(n_ints);
        init = (nothing, init),
        transform = first,
    ) do (_, init_value_for_new_int), int_index
        @inline
        first_index = n_bits_per_int * (int_index - 1) + 1
        unrolled_reduce(
            StaticOneTo(min(n_bits_per_int, N - first_index + 1));
            init = (zero(U), init_value_for_new_int),
        ) do (int, prev_value), bit_index
            @inline
            bit_offset = bit_index - 1
            item = generic_getindex(itr, first_index + bit_offset)
            new_value =
                first_index + bit_offset == 1 && prev_value isa NoInit ?
                item : op(prev_value, item)
            (int | U(transform(new_value)::Bool) << bit_offset, new_value)
        end
    end
    return StaticBitVector{N, U}(ints)
end

# TODO: Add unrolled_push and unrolled_append

@inline function unrolled_take(
    itr::StaticBitVector{<:Any, U},
    ::Val{N},
) where {N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N, n_bits_per_int)
    ints = unrolled_take(itr.ints, Val(n_ints))
    return StaticBitVector{N, U}(ints)
end

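# Dropping N bits amounts to dropping the chunks that contain only dropped bits
# and then, when the remaining offset is nonzero, shifting each remaining chunk
# down and filling its high bits from the following chunk.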
@inline function unrolled_drop(
    itr::StaticBitVector{N_old, U},
    ::Val{N},
) where {N_old, N, U}
    n_bits_per_int = 8 * sizeof(U)
    n_ints = cld(N_old - N, n_bits_per_int)
    n_dropped_ints = length(itr.ints) - n_ints
    bit_offset = N - n_bits_per_int * n_dropped_ints
    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
    ints = if bit_offset == 0
        ints_without_offset
    else
        cur_ints = ints_without_offset
        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
        unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
            @inline
            isnothing(next_int) ? cur_int >> bit_offset :
            cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset)
        end
    end
    return StaticBitVector{N_old - N, U}(ints)
end
18 changes: 18 additions & 0 deletions src/StaticOneTo.jl
@@ -0,0 +1,18 @@
"""
StaticOneTo(N)
A lazy and statically-sized analogue of `Base.OneTo(N)`.
This iterator can only store the integers from 1 to `N`, so its
`output_type_for_promotion` is `NoOutputType()`. An efficient method is provided
for `unrolled_take`, but no other unrolled functions can use `StaticOneTo`s as
output types.
"""
struct StaticOneTo{N} <: StaticSequence{N} end
@inline StaticOneTo(N) = StaticOneTo{N}()

@inline generic_getindex(::StaticOneTo, n) = n

@inline output_type_for_promotion(::StaticOneTo) = NoOutputType()

@inline unrolled_take(::StaticOneTo, ::Val{N}) where {N} = StaticOneTo(N)
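
# For example, StaticOneTo(4) iterates over 1, 2, 3, and 4 without storing them,
# and unrolled_take(StaticOneTo(9), Val(3)) is simply StaticOneTo(3).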