-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
construct huffman table from symbol spans #97
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
#pragma once | ||
|
||
#include "huffman/src/code.hpp" | ||
#include "huffman/src/detail/is_specialization_of.hpp" | ||
#include "huffman/src/symbol_span.hpp" | ||
|
||
#include <concepts> | ||
#include <cstddef> | ||
#include <cstdint> | ||
#include <iterator> | ||
#include <numeric> | ||
#include <ranges> | ||
#include <tuple> | ||
#include <utility> | ||
|
||
namespace starflate::huffman::detail { | ||
|
||
/// Range adaptor that flattens a view of symbol_span-bitsize ranges to a single | ||
/// view | ||
/// @tparam V random access view of symbol_span-bitsize elements | ||
/// | ||
/// Flattens a view of symbol_span-bitsize elements. Elements of the flattened | ||
/// view are symbol-code pairs. Note that code values set by this view are | ||
/// *unspecified*. It is expected that view is used in construction of a `table` | ||
/// and that valid codes are set with the `canonicalize()` member. | ||
/// | ||
/// Example: | ||
/// ~~~{.cpp} | ||
/// const auto data = std::vector<std::pair<symbol_span<char>, std::uint8_t>>{ | ||
/// {{'A', 'C'}, 2}, | ||
/// {{'D', 'F'}, 3}, | ||
/// {{'G', 'H'}, 4}, | ||
/// }; | ||
/// | ||
/// const auto elements = | ||
/// flattened_symbol_bitsize_view{std::ranges::ref_view{data}}; | ||
/// | ||
/// for (auto [symbol, code] : elements) { | ||
/// std::cout << symbol << ", " code << '\n'; | ||
/// } | ||
/// | ||
/// // prints | ||
/// // A, 00 | ||
/// // B, 00 | ||
/// // C, 00 | ||
/// // D, 000 | ||
/// // E, 000 | ||
/// // F, 000 | ||
/// // G, 0000 | ||
/// // H, 0000 | ||
/// ~~~ | ||
/// | ||
template <std::ranges::view V> | ||
requires std::ranges::random_access_range<V> and | ||
(std::tuple_size_v<std::ranges::range_value_t<V>> == 2) and | ||
is_specialization_of_v< | ||
std::tuple_element_t<0, std::ranges::range_value_t<V>>, | ||
symbol_span> and | ||
std::convertible_to< | ||
std::tuple_element_t<1, std::ranges::range_value_t<V>>, | ||
std::uint8_t> | ||
class flattened_symbol_bitsize_view | ||
: public std::ranges::view_interface<flattened_symbol_bitsize_view<V>> | ||
{ | ||
V base_; | ||
std::size_t size_; | ||
|
||
public: | ||
using base_type = V; | ||
using symbol_span_type = | ||
std::tuple_element_t<0, std::ranges::range_value_t<base_type>>; | ||
using symbol_type = typename symbol_span_type::symbol_type; | ||
|
||
/// Iterator for flattened_symbol_bitsize_view | ||
/// | ||
class iterator | ||
{ | ||
public: | ||
using iterator_category = std::forward_iterator_tag; | ||
using difference_type = std::ranges::range_difference_t<V>; | ||
using reference = std::pair<code, symbol_type>; | ||
using value_type = reference; | ||
using pointer = void; | ||
|
||
private: | ||
const flattened_symbol_bitsize_view* parent_{}; | ||
difference_type outer_index_{}; | ||
std::ranges::range_difference_t<symbol_span_type> inner_index_{}; | ||
|
||
template <class R, class T> | ||
static constexpr auto in_range_cast(T t) -> R | ||
{ | ||
assert(std::in_range<R>(t)); | ||
return static_cast<R>(t); | ||
} | ||
|
||
public: | ||
/// Default constructor | ||
/// | ||
iterator() | ||
{ | ||
// this should never be invoked, but the definition is required | ||
// for weakly_incrementable until GCC and Clang are fixed | ||
// https://en.cppreference.com/w/cpp/iterator/weakly_incrementable | ||
// https://wg21.link/P2325R3 | ||
assert(false); | ||
} | ||
|
||
/// Construct an iterator to a symbol-bitsize element | ||
/// @param parent view containing the symbol_span-bitsize sequence | ||
/// @param outer_index index specifying the symbol_span-bitsize element | ||
/// @param inner_index index within a symbol_span-bitsize element | ||
/// | ||
constexpr iterator( | ||
const flattened_symbol_bitsize_view& parent, | ||
// NOLINTBEGIN(bugprone-easily-swappable-parameters) | ||
std::size_t outer_index, | ||
std::size_t inner_index) | ||
// NOLINTEND(bugprone-easily-swappable-parameters) | ||
: parent_{&parent}, | ||
outer_index_{in_range_cast<difference_type>(outer_index)}, | ||
inner_index_{in_range_cast< | ||
std::ranges::range_difference_t<symbol_span_type>>(inner_index)} | ||
{} | ||
|
||
[[nodiscard]] | ||
constexpr auto | ||
operator*() const -> reference | ||
{ | ||
const auto [symbols, bitsize] = parent_->base()[outer_index_]; | ||
return {code{bitsize, {}}, symbols[inner_index_]}; | ||
} | ||
|
||
constexpr auto operator++() & -> iterator& | ||
{ | ||
const auto [symbols, _] = parent_->base()[outer_index_]; | ||
|
||
// if we've reached the end of a symbol_span | ||
if (std::cmp_equal(++inner_index_, symbols.size())) { | ||
// advance to the next symbol_span | ||
++outer_index_; | ||
// and reset the symbol_span index | ||
inner_index_ = {}; | ||
} | ||
|
||
return *this; | ||
} | ||
|
||
constexpr auto operator++(int) -> iterator | ||
{ | ||
auto tmp = *this; | ||
++*this; | ||
return tmp; | ||
} | ||
|
||
friend constexpr auto | ||
operator==(const iterator&, const iterator&) -> bool = default; | ||
}; | ||
|
||
constexpr explicit flattened_symbol_bitsize_view(V base) | ||
: base_{std::move(base)}, | ||
size_{std::accumulate( | ||
std::ranges::cbegin(base_), | ||
std::ranges::cend(base_), | ||
0UZ, | ||
[](auto acc, const auto& symbol_bitsize) { | ||
const auto& [symbols, _] = symbol_bitsize; | ||
return acc + symbols.size(); | ||
})} | ||
{} | ||
|
||
[[nodiscard]] | ||
constexpr auto base() const -> base_type | ||
{ | ||
return base_; | ||
} | ||
|
||
[[nodiscard]] | ||
constexpr auto size() const -> std::size_t | ||
{ | ||
return size_; | ||
} | ||
|
||
[[nodiscard]] | ||
constexpr auto begin() const -> iterator | ||
{ | ||
return {*this, 0UZ, 0UZ}; | ||
} | ||
|
||
[[nodiscard]] | ||
constexpr auto end() const -> iterator | ||
{ | ||
return {*this, base().size(), 0UZ}; | ||
} | ||
}; | ||
|
||
} // namespace starflate::huffman::detail |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#pragma once | ||
|
||
#include <type_traits> | ||
|
||
namespace starflate::huffman::detail { | ||
|
||
/// Checks if a type is a specialization of a class template | ||
/// @tparam T type | ||
/// @tparam primary class template | ||
/// | ||
/// @see https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2098r1.pdf | ||
/// | ||
/// @{ | ||
|
||
template <class T, template <class...> class primary> | ||
struct is_specialization_of : std::false_type | ||
{}; | ||
|
||
template <template <class...> class primary, class... Args> | ||
struct is_specialization_of<primary<Args...>, primary> : std::true_type | ||
{}; | ||
|
||
template <class T, template <class...> class primary> | ||
inline constexpr auto is_specialization_of_v = | ||
is_specialization_of<T, primary>::value; | ||
|
||
/// @} | ||
|
||
} // namespace starflate::huffman::detail |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#pragma once | ||
|
||
#include "huffman/src/utility.hpp" | ||
|
||
#include <ranges> | ||
|
||
namespace starflate::huffman { | ||
|
||
/// An inclusive span of symbols | ||
/// | ||
template <symbol S> | ||
class symbol_span : public std::ranges::view_interface<symbol_span<S>> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the difference between this and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It pretty much is the same as It just has a slightly different constructor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, the only difference being end is inclusive rather than exclusive? |
||
{ | ||
S first_; | ||
S last_; | ||
|
||
using range_type = std::ranges::iota_view<S, S>; | ||
|
||
constexpr auto as_range() const -> range_type | ||
{ | ||
return range_type{first_, static_cast<S>(last_ + S{1})}; | ||
} | ||
|
||
public: | ||
using symbol_type = S; | ||
|
||
using iterator = std::ranges::iterator_t<range_type>; | ||
|
||
/// Construct a symbol span of a single symbol | ||
/// @param first symbol | ||
/// | ||
constexpr symbol_span(symbol_type first) : symbol_span{first, first} {} | ||
|
||
/// Construct a symbol span from first to last, inclusive | ||
/// @param first, last inclusive symbol range | ||
/// @pre first <= last | ||
/// | ||
constexpr symbol_span(symbol_type first, symbol_type last) | ||
: first_{first}, last_{last} | ||
{ | ||
assert(first <= last); | ||
} | ||
|
||
[[nodiscard]] | ||
constexpr auto begin() const -> iterator | ||
{ | ||
return as_range().begin(); | ||
} | ||
|
||
[[nodiscard]] | ||
constexpr auto end() const -> iterator | ||
{ | ||
return as_range().end(); | ||
} | ||
}; | ||
|
||
} // namespace starflate::huffman |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we instead add a little complexity to the table constructor and avoid this class?
Could we have a new tag for a constructor to differentiate the constructor that takes symbol-bitsize pairs and the one that takes symbol-range-bitsize pairs?
I'm honestly a bit lost in what's going on here, which doesn't mean it's wrong, but I want to explore if we can make it a bit easier for a noob like me to follow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure let's look into that. I'll give it some thought tomorrow or this weekend.
At a high level, we get a range of ranges:
and we want to flatten to:
This view provides a lazy flatten so we don't need to actually construct the flattened range of all the subranges.
Note that the subranges (
symbol_span
s) don't actually store all the elements but are instead are generators.We're essentially doing this: https://docs.python.org/3/library/itertools.html#itertools.chain
for a collection of
symbol_span
s.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had some thoughts about this and I'll try to write it out tomorrow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently,
table
constructors take a range and potentially a tag, and pass that range to the table_storage's constructor. The table_storage's constructor is responsible for taking that range and filling up the internal container with "valid enough" codes and symbols. Once the table_storage's constructor is complete, no elements can be added or removed, as members such as resize and emplace_back are not exposed upwards. After table_storage's constructor is complete, we get to the function body of table's constructor which is responsible for building the Huffman tree, setting skip fields, and then canonicalizing the codes. table_storage is responsible for adding elements, table is responsible for moving them around.I was thinking about passing the symbol-span+count range down to the table_storage constructor and doing nested for loops. If we're able to reuse existing constructors, then we can focus on reviewing the correctness of the range adapter code, which admittedly requires understanding the range abstractions. But I think this will be pretty useful given that this library is focusing on handling streams of data and we'll likely run into other cases where a range abstraction is useful, even after switching to execution on the GPU. I am very much in the no raw loops camp.
I've added more documentation to the
flattened_symbol_bitsize_view
class template. I've also removed theto_code_symbol
function and instead do the equivalent inflattened_symbol_bitsize_view::iterator::operator*()
. Hopefully things are a bit clearer but let me know if there's anything I can explain or we can also try another approach.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The thing that threw me off was the use of concepts to figure out whether to use this flattened_symbol_bitsize_view, and I was suggesting using a constructor tag intead. I was not objecting to the existence of this class.
But I guess from a caller's point of view this is nicer, and you've already figured it out so I guess we can keep it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I misunderstood. But yeah, I'll generally try to keep the interface simpler for the caller at the expense of more complexity in the implementation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How is your understanding of the view class? I don't mind going over it if there are things you don't understand.