Skip to content

Commit

Permalink
store huffman table elements in forward order (#94)
Browse files Browse the repository at this point in the history
Remove use of std::reverse_iterator when iterating over table elements.
Instead, invoke reverse on the underlying table storage after
construction.

Change-Id: I45916a591f51298fff8b4012b9507b781021100e
  • Loading branch information
oliverlee authored Oct 12, 2023
1 parent 8dd46b1 commit 9098e1c
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 181 deletions.
1 change: 1 addition & 0 deletions huffman/src/detail/static_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class static_vector : std::array<T, Capacity>
}

constexpr auto size() const noexcept -> size_type { return size_; }
constexpr auto empty() const noexcept -> size_type { return not size(); }

constexpr auto end() noexcept -> iterator { return begin() + size(); }
constexpr auto end() const noexcept -> const_iterator
Expand Down
56 changes: 22 additions & 34 deletions huffman/src/detail/table_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,60 +96,48 @@ class table_node : public encoding<Symbol>
return init_.frequency;
}

constexpr auto node_size() const
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access)
return init_.node_size;
}

/// Obtains the next node, with respect to node size
/// Distance to next internal node
///
/// Given a contiguous container of nodes, `next()` returns a pointer to
/// the next node, using `node_size()` to determine if `*this` is an
/// internal node or a leaf node. If `*this` is an internal node, (i.e.
/// `*this` represents a node with children) `next()` skips the appropriate
/// number of elements in the associated container.
/// Given a contiguous container of nodes, `node_size()` returns the distance
/// to the next internal node.
///
/// | freq: 3 | freq: 1 | freq: 1 | freq: 4 | freq: 2 |
/// | ns: 3 | ns: 1 | ns: 1 | ns: 2 | ns: 1 |
/// ^ ^
/// this |
/// |
/// this->next() -----------------+
/// next internal node -----------+
///
/// @{

constexpr auto next() -> table_node* { return this + node_size(); }
constexpr auto next() const -> const table_node*
constexpr auto node_size() const
{
return this + node_size();
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access)
return init_.node_size;
}

/// @}

/// "Joins" two `table_node`s
/// @param lhs left table_node
/// @param rhs right table_node
/// @pre `&lhs + lhs.node_size() == &rhs`
///
/// Logically "join" `*this` with the next adjacent node `*next()`,
/// "creating" an internal node. This adds the frequency of the next node to
/// this node, left pads all the codes of the internal nodes of `*this` with
/// 0s and left pads all the code of the internal nodes of `*next()` with
/// 1s.
/// Logically "join" `lhs` with the next adjacent node `rhs` "creating" an
/// internal node. This adds the frequency of `rhs` to `lhs`, left pads all
/// the codes of the internal nodes of `lhs` with 0s and left pads all the
/// code of the internal nodes of `rhs` with 1s.
///
/// @pre `*this` is not `back()` of the associated container
///
constexpr auto join_with_next() & -> void
friend constexpr auto join(table_node& lhs, table_node& rhs) -> void
{
assert(
&lhs + lhs.node_size() == &rhs and "`lhs` and `rhs` are not adjacent");

const auto left_pad_with = [](auto b) {
return [b](table_node& n) { b >> static_cast<code&>(n); };
};

std::for_each(this, next(), left_pad_with(bit{0}));
std::for_each(next(), next()->next(), left_pad_with(bit{1}));
std::for_each(&lhs, &rhs, left_pad_with(bit{0}));
std::for_each(&rhs, &rhs + rhs.node_size(), left_pad_with(bit{1}));

const auto& n = *next();
// NOLINTBEGIN(cppcoreguidelines-pro-type-union-access)
init_.frequency += n.frequency();
init_.node_size += n.node_size();
lhs.init_.frequency += rhs.frequency();
lhs.init_.node_size += rhs.node_size();
// NOLINTEND(cppcoreguidelines-pro-type-union-access)
}

Expand Down
3 changes: 1 addition & 2 deletions huffman/src/detail/table_storage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,6 @@ class table_storage : table_storage_base_t<IntrusiveNode, Extent>
++it;
}

std::ranges::sort(*this, std::ranges::greater{}, as_code);

assert(std::ranges::unique(*this, {}, as_code).empty());
assert(std::ranges::unique(*this, {}, as_symbol).empty());
}
Expand All @@ -127,6 +125,7 @@ class table_storage : table_storage_base_t<IntrusiveNode, Extent>
using base_type::cbegin;
using base_type::cend;
using base_type::data;
using base_type::empty;
using base_type::end;
using base_type::front;
using base_type::size;
Expand Down
115 changes: 71 additions & 44 deletions huffman/src/table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,19 @@ class table

detail::table_storage<node_type, Extent> table_;

constexpr static auto find_node_if(auto first, auto last, auto pred)
template <std::unsigned_integral U>
static constexpr auto to_index(U uint)
{
for (; first != last; first = first->next()) {
using S = std::ranges::range_difference_t<decltype(table_)>;

assert(std::cmp_less_equal(uint, std::numeric_limits<S>::max()));
return static_cast<S>(uint);
}

template <std::forward_iterator I, std::indirect_unary_predicate<I> P>
constexpr static auto find_node_if(I first, I last, P pred)
{
for (; first != last; first += to_index(first->node_size())) {
if (pred(*first)) {
break;
}
Expand All @@ -55,10 +65,38 @@ class table
return first;
}

constexpr auto encode_symbols() -> void
{
// precondition, audit
assert(std::ranges::is_sorted(table_));

const auto total_size = table_.size();
auto first = table_.begin();
const auto last = table_.end();

while (first->node_size() != total_size) {
join(first[0], first[to_index(first->node_size())]);

const auto has_higher_freq = [f = first->frequency()](const auto& n) {
return n.frequency() > f;
};

auto lower = first + to_index(first->node_size());
auto upper = find_node_if(lower, last, has_higher_freq);

// re-sort after creating a new internal node
std::rotate(first, lower, upper);
}
}

constexpr auto construct_table() -> void
{
using size_type = decltype(table_.size());

if (table_.empty()) {
return;
}

if (table_.size() == size_type{1}) {
using namespace huffman::literals;
static_cast<code&>(table_.front()) = 0_c;
Expand All @@ -67,27 +105,31 @@ class table

std::ranges::sort(table_);

// precondition
assert(
std::ranges::unique(
table_, [](auto& x, auto& y) { return x.symbol == y.symbol; })
table_, {}, [](const auto& elem) { return elem.symbol; })
.empty() and
"`frequencies` cannot contain duplicate symbols");
"a `table` cannot contain duplicate symbols");

while (table_.front().node_size() != table_.size()) {
table_.front().join_with_next();
const auto frequencies = std::views::transform(
table_, [](const auto& elem) { return elem.frequency(); });
[[maybe_unused]] const auto total_freq =
std::accumulate(std::cbegin(frequencies), std::cend(frequencies), 0UZ);

const auto last = table_.data() + table_.size();
const auto has_higher_freq =
[f = table_.front().frequency()](const auto& n) {
return n.frequency() > f;
};
encode_symbols();

auto lower = table_.front().next();
auto upper = find_node_if(lower, last, has_higher_freq);
// postcondition
assert(total_freq == table_.front().frequency());

// re-sort after creating a new internal node
std::rotate(&table_.front(), lower, upper);
}
// Implicit construction of the Huffman tree results in the least frequent
// symbols at the beginning (largest bitsize) and most frequent at the end
// (smallest bitsize). See details in `table_node.hpp` for how this is
// represented.
//
// Reversing the elements allows code search to start with the symbols with
// the smallest bitsize.
std::ranges::reverse(table_);
}

constexpr auto set_skip_fields() -> void
Expand All @@ -96,7 +138,7 @@ class table

const node_type* prev{};

for (auto& elem : table_) {
for (auto& elem : std::views::reverse(table_)) {
const auto s =
skip_type{1} +
(prev and (elem.bitsize() == prev->bitsize())
Expand All @@ -121,10 +163,7 @@ class table
/// Const iterator type
///
using const_iterator = detail::element_base_iterator<
// TODO: construct_table builds the table in reverse order.
// would be nice to fix that so we can get rid of reverse_iterator here.
std::reverse_iterator<
typename detail::table_storage<node_type, Extent>::const_iterator>,
typename detail::table_storage<node_type, Extent>::const_iterator,
encoding<Symbol>>;

/// Constructs an empty table
Expand Down Expand Up @@ -154,16 +193,7 @@ class table
constexpr table(const R& frequencies, std::optional<symbol_type> eot)
: table_{detail::frequency_tag{}, frequencies, eot}
{
[[maybe_unused]] const auto total_freq = std::accumulate(
std::cbegin(frequencies),
std::cend(frequencies),
std::size_t{eot.has_value()},
[](auto acc, auto kv) { return acc + kv.second; });

construct_table();

assert(total_freq == table_.front().frequency());

set_skip_fields();
}

Expand Down Expand Up @@ -247,7 +277,7 @@ class table
[[nodiscard]]
constexpr auto begin() const -> const_iterator
{
return const_iterator{std::make_reverse_iterator(table_.end())};
return const_iterator{table_.begin()};
}

/// Returns an iterator past the last `encoding`
Expand All @@ -256,7 +286,7 @@ class table
[[nodiscard]]
constexpr auto end() const -> const_iterator
{
return const_iterator{std::make_reverse_iterator(table_.begin())};
return const_iterator{table_.end()};
}

/// Finds element with specific code within the code table
Expand Down Expand Up @@ -337,12 +367,9 @@ class table
{
using value_type = decltype(std::declval<code>().value());

// elements are stored in reverse order, so we maintain that order
auto reversed = std::views::reverse(table_);

// set lexicographical order
std::ranges::sort( //
reversed, //
table_, //
[](const auto& x, const auto& y) {
return std::pair{x.bitsize(), std::ref(x.symbol)} <
std::pair{y.bitsize(), std::ref(y.symbol)};
Expand All @@ -356,19 +383,19 @@ class table
auto next_code = code{};

// clang-format off
for (auto& n : reversed) {
assert(next_code.bitsize() <= n.bitsize());
for (auto& elem : table_) {
assert(next_code.bitsize() <= elem.bitsize());

next_code = {
n.bitsize(),
next_code.bitsize() == n.bitsize()
? next_code.value() + value_type{1} // 3) next_code[len]++;
: base_code <<= (n.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1;
elem.bitsize(),
next_code.bitsize() == elem.bitsize()
? next_code.value() + value_type{1} // 3) next_code[len]++;
: base_code <<= (elem.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1;
};

static_cast<code&>(n) = next_code; // 3) tree[n].Code = next_code[len];
static_cast<code&>(elem) = next_code; // 3) tree[n].Code = next_code[len];

++base_code; // 2) (code + bl_count[bits-1])
++base_code; // 2) (code + bl_count[bits-1])
}
// clang-format on

Expand Down
14 changes: 6 additions & 8 deletions huffman/test/decode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
#include <boost/ut.hpp>

#include <array>
#include <climits>
#include <cstddef>
#include <stdexcept>
#include <utility>
#include <vector>

auto main() -> int
{
Expand All @@ -19,7 +17,7 @@ auto main() -> int

test("basic") = [] {
// encoded data from dahuffman readme.rst, but in hex.
constexpr std::array<std::byte, 6> encoded_bytes = {
constexpr std::array encoded_bytes = {
std::byte{0x86},
std::byte{0x7c},
std::byte{0x25},
Expand All @@ -31,12 +29,12 @@ auto main() -> int
static constexpr auto code_table = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{00000_c, eot},
{00001_c, 'x'},
{0001_c, 'q'},
{001_c, 'n'},
{std::pair{1_c, 'e'},
{01_c, 'i'},
{1_c, 'e'}}
{001_c, 'n'},
{0001_c, 'q'},
{00001_c, 'x'},
{00000_c, eot}}
}; // clang-format on

constexpr std::array expected = {
Expand Down
Loading

0 comments on commit 9098e1c

Please sign in to comment.