Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

store huffman table elements in forward order #94

Merged
merged 1 commit into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions huffman/src/detail/static_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class static_vector : std::array<T, Capacity>
}

constexpr auto size() const noexcept -> size_type { return size_; }
constexpr auto empty() const noexcept -> size_type { return not size(); }

constexpr auto end() noexcept -> iterator { return begin() + size(); }
constexpr auto end() const noexcept -> const_iterator
Expand Down
56 changes: 22 additions & 34 deletions huffman/src/detail/table_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,60 +96,48 @@ class table_node : public encoding<Symbol>
return init_.frequency;
}

constexpr auto node_size() const
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access)
return init_.node_size;
}

/// Obtains the next node, with respect to node size
/// Distance to next internal node
///
/// Given a contiguous container of nodes, `next()` returns a pointer to
/// the next node, using `node_size()` to determine if `*this` is an
/// internal node or a leaf node. If `*this` is an internal node, (i.e.
/// `*this` represents a node with children) `next()` skips the appropriate
/// number of elements in the associated container.
/// Given a contiguous container of nodes, `node_size()` returns the distance
/// to the next internal node.
///
/// | freq: 3 | freq: 1 | freq: 1 | freq: 4 | freq: 2 |
/// | ns: 3 | ns: 1 | ns: 1 | ns: 2 | ns: 1 |
/// ^ ^
/// this |
/// |
/// this->next() -----------------+
/// next internal node -----------+
///
/// @{

constexpr auto next() -> table_node* { return this + node_size(); }
constexpr auto next() const -> const table_node*
constexpr auto node_size() const
{
return this + node_size();
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access)
return init_.node_size;
}

/// @}

/// "Joins" two `table_node`s
/// @param lhs left table_node
/// @param rhs right table_node
/// @pre `&lhs + lhs.node_size() == &rhs`
///
/// Logically "join" `*this` with the next adjacent node `*next()`,
/// "creating" an internal node. This adds the frequency of the next node to
/// this node, left pads all the codes of the internal nodes of `*this` with
/// 0s and left pads all the code of the internal nodes of `*next()` with
/// 1s.
/// Logically "join" `lhs` with the next adjacent node `rhs` "creating" an
/// internal node. This adds the frequency of `rhs` to `lhs`, left pads all
/// the codes of the internal nodes of `lhs` with 0s and left pads all the
/// code of the internal nodes of `rhs` with 1s.
///
/// @pre `*this` is not `back()` of the associated container
///
constexpr auto join_with_next() & -> void
friend constexpr auto join(table_node& lhs, table_node& rhs) -> void
{
assert(
&lhs + lhs.node_size() == &rhs and "`lhs` and `rhs` are not adjacent");

const auto left_pad_with = [](auto b) {
return [b](table_node& n) { b >> static_cast<code&>(n); };
};

std::for_each(this, next(), left_pad_with(bit{0}));
std::for_each(next(), next()->next(), left_pad_with(bit{1}));
std::for_each(&lhs, &rhs, left_pad_with(bit{0}));
std::for_each(&rhs, &rhs + rhs.node_size(), left_pad_with(bit{1}));

const auto& n = *next();
// NOLINTBEGIN(cppcoreguidelines-pro-type-union-access)
init_.frequency += n.frequency();
init_.node_size += n.node_size();
lhs.init_.frequency += rhs.frequency();
lhs.init_.node_size += rhs.node_size();
// NOLINTEND(cppcoreguidelines-pro-type-union-access)
}

Expand Down
3 changes: 1 addition & 2 deletions huffman/src/detail/table_storage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,6 @@ class table_storage : table_storage_base_t<IntrusiveNode, Extent>
++it;
}

std::ranges::sort(*this, std::ranges::greater{}, as_code);

assert(std::ranges::unique(*this, {}, as_code).empty());
assert(std::ranges::unique(*this, {}, as_symbol).empty());
}
Expand All @@ -127,6 +125,7 @@ class table_storage : table_storage_base_t<IntrusiveNode, Extent>
using base_type::cbegin;
using base_type::cend;
using base_type::data;
using base_type::empty;
using base_type::end;
using base_type::front;
using base_type::size;
Expand Down
115 changes: 71 additions & 44 deletions huffman/src/table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,19 @@

detail::table_storage<node_type, Extent> table_;

constexpr static auto find_node_if(auto first, auto last, auto pred)
template <std::unsigned_integral U>
static constexpr auto to_index(U uint)
{
for (; first != last; first = first->next()) {
using S = std::ranges::range_difference_t<decltype(table_)>;

assert(std::cmp_less_equal(uint, std::numeric_limits<S>::max()));
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why std::cmp_less_equal instead of <=?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a comparison of signed with unsigned integral types and I don't know the integral promotion and conversion rules. With cmp_less_equal, it "will do the right thing". If both operands were signed or both were unsigned integrals, I would use <=. I also partially rely on the compiler to catch these issues with -Wconversion

https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wconversion

There's some decent motivation in the original proposal to add these functions:
https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0586r2.html

return static_cast<S>(uint);
}

template <std::forward_iterator I, std::indirect_unary_predicate<I> P>
constexpr static auto find_node_if(I first, I last, P pred)
{
for (; first != last; first += to_index(first->node_size())) {
if (pred(*first)) {
break;
}
Expand All @@ -55,10 +65,38 @@
return first;
}

constexpr auto encode_symbols() -> void
{
// precondition, audit
assert(std::ranges::is_sorted(table_));

const auto total_size = table_.size();
auto first = table_.begin();
const auto last = table_.end();

while (first->node_size() != total_size) {
join(first[0], first[to_index(first->node_size())]);

const auto has_higher_freq = [f = first->frequency()](const auto& n) {
return n.frequency() > f;
};

auto lower = first + to_index(first->node_size());
auto upper = find_node_if(lower, last, has_higher_freq);

// re-sort after creating a new internal node
std::rotate(first, lower, upper);
}
}

constexpr auto construct_table() -> void
{
using size_type = decltype(table_.size());

if (table_.empty()) {
return;

Check warning on line 97 in huffman/src/table.hpp

View check run for this annotation

Codecov / codecov/patch

huffman/src/table.hpp#L97

Added line #L97 was not covered by tests
}

if (table_.size() == size_type{1}) {
using namespace huffman::literals;
static_cast<code&>(table_.front()) = 0_c;
Expand All @@ -67,27 +105,31 @@

std::ranges::sort(table_);

// precondition
assert(
std::ranges::unique(
table_, [](auto& x, auto& y) { return x.symbol == y.symbol; })
table_, {}, [](const auto& elem) { return elem.symbol; })
.empty() and
"`frequencies` cannot contain duplicate symbols");
"a `table` cannot contain duplicate symbols");

while (table_.front().node_size() != table_.size()) {
table_.front().join_with_next();
const auto frequencies = std::views::transform(
table_, [](const auto& elem) { return elem.frequency(); });
[[maybe_unused]] const auto total_freq =
std::accumulate(std::cbegin(frequencies), std::cend(frequencies), 0UZ);

const auto last = table_.data() + table_.size();
const auto has_higher_freq =
[f = table_.front().frequency()](const auto& n) {
return n.frequency() > f;
};
encode_symbols();

auto lower = table_.front().next();
auto upper = find_node_if(lower, last, has_higher_freq);
// postcondition
assert(total_freq == table_.front().frequency());

// re-sort after creating a new internal node
std::rotate(&table_.front(), lower, upper);
}
// Implicit construction of the Huffman tree results in the least frequent
// symbols at the beginning (largest bitsize) and most frequent at the end
// (smallest bitsize). See details in `table_node.hpp` for how this is
// represented.
//
// Reversing the elements allows code search to start with the symbols with
// the smallest bitsize.
std::ranges::reverse(table_);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a comment about why

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add this later:

If front().frequency() does not equal total_freq, we have skipped over nodes during construction of the implicit Huffman tree.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or you feel free to :)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant comment about why the reverse, not about the assertion.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

}

constexpr auto set_skip_fields() -> void
Expand All @@ -96,7 +138,7 @@

const node_type* prev{};

for (auto& elem : table_) {
for (auto& elem : std::views::reverse(table_)) {
const auto s =
skip_type{1} +
(prev and (elem.bitsize() == prev->bitsize())
Expand All @@ -121,10 +163,7 @@
/// Const iterator type
///
using const_iterator = detail::element_base_iterator<
// TODO: construct_table builds the table in reverse order.
// would be nice to fix that so we can get rid of reverse_iterator here.
std::reverse_iterator<
typename detail::table_storage<node_type, Extent>::const_iterator>,
typename detail::table_storage<node_type, Extent>::const_iterator,
encoding<Symbol>>;

/// Constructs an empty table
Expand Down Expand Up @@ -154,16 +193,7 @@
constexpr table(const R& frequencies, std::optional<symbol_type> eot)
: table_{detail::frequency_tag{}, frequencies, eot}
{
[[maybe_unused]] const auto total_freq = std::accumulate(
std::cbegin(frequencies),
std::cend(frequencies),
std::size_t{eot.has_value()},
[](auto acc, auto kv) { return acc + kv.second; });

construct_table();

assert(total_freq == table_.front().frequency());

set_skip_fields();
}

Expand Down Expand Up @@ -247,7 +277,7 @@
[[nodiscard]]
constexpr auto begin() const -> const_iterator
{
return const_iterator{std::make_reverse_iterator(table_.end())};
return const_iterator{table_.begin()};
}

/// Returns an iterator past the last `encoding`
Expand All @@ -256,7 +286,7 @@
[[nodiscard]]
constexpr auto end() const -> const_iterator
{
return const_iterator{std::make_reverse_iterator(table_.begin())};
return const_iterator{table_.end()};
}

/// Finds element with specific code within the code table
Expand Down Expand Up @@ -337,12 +367,9 @@
{
using value_type = decltype(std::declval<code>().value());

// elements are stored in reverse order, so we maintain that order
auto reversed = std::views::reverse(table_);

// set lexicographical order
std::ranges::sort( //
reversed, //
table_, //
[](const auto& x, const auto& y) {
return std::pair{x.bitsize(), std::ref(x.symbol)} <
std::pair{y.bitsize(), std::ref(y.symbol)};
Expand All @@ -356,19 +383,19 @@
auto next_code = code{};

// clang-format off
for (auto& n : reversed) {
assert(next_code.bitsize() <= n.bitsize());
for (auto& elem : table_) {
assert(next_code.bitsize() <= elem.bitsize());

next_code = {
n.bitsize(),
next_code.bitsize() == n.bitsize()
? next_code.value() + value_type{1} // 3) next_code[len]++;
: base_code <<= (n.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1;
elem.bitsize(),
next_code.bitsize() == elem.bitsize()
? next_code.value() + value_type{1} // 3) next_code[len]++;
: base_code <<= (elem.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1;
};

static_cast<code&>(n) = next_code; // 3) tree[n].Code = next_code[len];
static_cast<code&>(elem) = next_code; // 3) tree[n].Code = next_code[len];

++base_code; // 2) (code + bl_count[bits-1])
++base_code; // 2) (code + bl_count[bits-1])
}
// clang-format on

Expand Down
14 changes: 6 additions & 8 deletions huffman/test/decode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
#include <boost/ut.hpp>

#include <array>
#include <climits>
#include <cstddef>
#include <stdexcept>
#include <utility>
#include <vector>

auto main() -> int
{
Expand All @@ -19,7 +17,7 @@ auto main() -> int

test("basic") = [] {
// encoded data from dahuffman readme.rst, but in hex.
constexpr std::array<std::byte, 6> encoded_bytes = {
constexpr std::array encoded_bytes = {
std::byte{0x86},
std::byte{0x7c},
std::byte{0x25},
Expand All @@ -31,12 +29,12 @@ auto main() -> int
static constexpr auto code_table = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{00000_c, eot},
{00001_c, 'x'},
{0001_c, 'q'},
{001_c, 'n'},
{std::pair{1_c, 'e'},
{01_c, 'i'},
{1_c, 'e'}}
{001_c, 'n'},
{0001_c, 'q'},
{00001_c, 'x'},
{00000_c, eot}}
}; // clang-format on

constexpr std::array expected = {
Expand Down
Loading
Loading