Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support decompressing fixed huffman blocks #124

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions huffman/src/decode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,57 @@
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
/// @tparam O The type of the output iterator.
template <
symbol Symbol,
std::size_t Extent = std::dynamic_extent,
std::output_iterator<Symbol> O>
template <symbol Symbol, std::size_t Extent, std::output_iterator<Symbol> O>
constexpr auto
decode(const table<Symbol, Extent>& code_table, bit_span bits, O output) -> O
{
while (!bits.empty()) {
auto result = decode_one(code_table, bits);
if (result.encoded_size == 0) {
break;
}
*output = result.symbol;
output++;
bits.consume(result.encoded_size);
garymm marked this conversation as resolved.
Show resolved Hide resolved
}
return output;
}

Check warning on line 38 in huffman/src/decode.hpp

View check run for this annotation

Codecov / codecov/patch

huffman/src/decode.hpp#L27-L38

Added lines #L27 - L38 were not covered by tests

template <symbol Symbol>
struct decode_result
{
Symbol symbol;
std::uint8_t encoded_size;
};

/// Decodes a single symbol from \p bits using \p code_table.
///
/// @param code_table The code table to use for decoding.
/// @param bits The bit stream to decode.
///
/// @returns The decoded symbol and how many bits its code was.
/// If no symbol was found, result.encoded_size == 0.
/// @tparam Symbol The type of the symbols in the code table.
/// @tparam Extent The extent of the code table.
template <symbol Symbol, std::size_t Extent>
constexpr auto
decode_one(const table<Symbol, Extent>& code_table, bit_span bits)
-> decode_result<Symbol>
{
code current_code{};
auto code_table_pos = code_table.begin();
for (auto bit : bits) {
current_code << bit;
auto found = code_table.find(current_code, code_table_pos);
if (found) {
*output = (*found)->symbol;
output++;
code_table_pos = code_table.begin();
current_code = code{};
continue;
return {(*found)->symbol, (*found)->bitsize()};
}
if (found.error() == code_table.end()) {
break;
}
code_table_pos = found.error();
}
return output;
return {Symbol{}, 0};

Check warning on line 74 in huffman/src/decode.hpp

View check run for this annotation

Codecov / codecov/patch

huffman/src/decode.hpp#L74

Added line #L74 was not covered by tests
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
}

} // namespace starflate::huffman
4 changes: 3 additions & 1 deletion huffman/test/bit_span_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <array>
#include <climits>
#include <cstdint>
#include <numeric>
#include <ranges>
#include <vector>

Expand Down Expand Up @@ -116,6 +115,9 @@ auto main() -> int
if (std::cmp_less(n, initial_bits.size())) {
expect(nth_bit(n) == bits[0]);
}
if (n == 0) {
expect(initial_bits.byte_data() == bits.byte_data());
}
} else {
expect(aborts([&] { bits.consume(n); }));
}
Expand Down
202 changes: 193 additions & 9 deletions src/decompress.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "decompress.hpp"

#include <cstdint>
#include <iostream>
#include <iterator>
#include <utility>

Expand Down Expand Up @@ -31,6 +32,183 @@
return BlockHeader{final, type};
}

// RFC 3.2.6: static literal/length table
//
// literal/length bitsize code
// ============== ======= =========================
// 0 - 143 8 0011'0000 - 1011'1111
// 144 - 255 9 1'1001'0000 - 1'1111'1111
// 256 - 279 7 000'0000 - 001'0111
// 280 - 287 8 1100'0000 - 1100'0111

constexpr std::size_t fixed_len_table_size = 288;

constexpr auto fixed_len_table = // clang-format off
huffman::table<std::uint16_t, fixed_len_table_size>{
huffman::symbol_bitsize,
{{{ 0, 143}, 8},
{{144, 255}, 9},
{{256, 279}, 7},
{{280, 287}, 8}}};
// clang-format on

constexpr std::size_t fixed_dist_table_size = 32;

constexpr auto fixed_dist_table = huffman::table<
std::uint16_t,
fixed_dist_table_size>{huffman::symbol_bitsize, {{{0, 31}, 5}}};

struct LengthInfo
{
std::uint8_t extra_bits;
std::uint16_t base;
};

constexpr auto lit_or_len_end_of_block = std::uint16_t{256};
constexpr auto lit_or_len_max = std::uint16_t{285};
constexpr auto lit_or_len_max_decoded = std::uint16_t{258};

// RFC 3.2.5: Compressed blocks (length and distance codes)
constexpr auto length_infos = std::array<LengthInfo, 28>{
{{0, 3}, {0, 4}, {0, 5}, {0, 6}, {0, 7}, {0, 8}, {0, 9},
{0, 10}, {1, 11}, {1, 13}, {1, 15}, {1, 17}, {2, 19}, {2, 23},
{2, 27}, {2, 31}, {3, 35}, {3, 43}, {3, 51}, {3, 59}, {4, 67},
{4, 83}, {4, 99}, {4, 115}, {5, 131}, {5, 163}, {5, 195}, {5, 227}}};

constexpr auto distance_infos = std::array<LengthInfo, 30>{
{{0, 1}, {0, 2}, {0, 3}, {0, 4}, {1, 5},
{1, 7}, {2, 9}, {2, 13}, {3, 17}, {3, 25},
{4, 33}, {4, 49}, {5, 65}, {5, 97}, {6, 129},
{6, 193}, {7, 257}, {7, 385}, {8, 513}, {8, 769},
{9, 1025}, {9, 1537}, {10, 2049}, {10, 3073}, {11, 4097},
{11, 6145}, {12, 8193}, {12, 12289}, {13, 16385}, {13, 24577}}};

/// Removes n bits from the beginning of bits and returns them.
///
/// @pre bits contains at least n bits.
/// @pre n <= 16
///
/// @returns the n bits removed from the beginning of this.
/// The bits are in the lower (rightmost) part of the return value.
///
auto pop_extra_bits(huffman::bit_span& bits, std::uint8_t n) -> std::uint16_t
{
assert(n <= 16);
auto iter = bits.begin();
std::uint16_t res{};
for (std::uint8_t i{}; i < n; i++) {
res |= static_cast<std::uint16_t>(
static_cast<std::uint16_t>(static_cast<bool>(*iter)) << i);
iter += 1;
}
bits.consume(n); // invalidates iter, so must come after the loop
return res;
}

enum class ParseLitOrLenStatus : std::uint8_t
{
EndOfBlock,
Error,
};

auto parse_lit_or_len(
std::uint16_t lit_or_len, huffman::bit_span& src_bits) -> std::
expected<std::variant<std::byte, std::uint16_t>, ParseLitOrLenStatus>
{
if (lit_or_len < detail::lit_or_len_end_of_block) {
return static_cast<std::byte>(lit_or_len);
}
if (lit_or_len == detail::lit_or_len_end_of_block) {
return std::unexpected{ParseLitOrLenStatus::EndOfBlock};
}
if (lit_or_len > detail::lit_or_len_max) {
return std::unexpected{ParseLitOrLenStatus::Error};
}

Check warning on line 126 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L125-L126

Added lines #L125 - L126 were not covered by tests
std::uint16_t len{};
if (lit_or_len == detail::lit_or_len_max) {
len = detail::lit_or_len_max_decoded;
} else {
Comment on lines +127 to +130
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like you can just return instead of construction + assignment of len?

if (lit_or_len == detail::lit_or_len_max) {
    return detail::lit_or_len_max_decoded;
}

const auto len_idx =
static_cast<size_t>(lit_or_len - detail::lit_or_len_end_of_block - 1);
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
const auto& len_info = detail::length_infos[len_idx];
const auto extra_len = pop_extra_bits(src_bits, len_info.extra_bits);
len = len_info.base + extra_len;
}
return len;
}

auto decompress_block_huffman(
huffman::bit_span& src_bits,
std::span<std::byte> dst,
std::ptrdiff_t& dst_written,
Comment on lines +143 to +144
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::span<std::byte> dst,
std::ptrdiff_t& dst_written,
std::span<std::byte>& dst,

const huffman::table<std::uint16_t, fixed_len_table_size>& len_table,
const huffman::table<std::uint16_t, fixed_dist_table_size>& dist_table)
-> DecompressStatus
{
while (true) {
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
const auto lit_or_len_decoded = huffman::decode_one(len_table, src_bits);
if (not lit_or_len_decoded.encoded_size) {
return DecompressStatus::InvalidLitOrLen;
}

Check warning on line 153 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L152-L153

Added lines #L152 - L153 were not covered by tests
Comment on lines +150 to +153
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if src_bits doesn't have enough bits to decode a symbol - that would be "success" and this function should exit?

If decode_one is changed to return expected<>, maybe something like this?

const auto lit_or_len = huffman::decode_one(len_table, src_bits);
if (not lit_or_len) {
  return status_for(lit_or_len.error());
}

If src_bits doesn't have enough bits, you do have have a partially constructed code and iterator in the code table. You could potentially keep those in order to skip some computation when more bits arrive - although it's certainly simpler to just restart.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you thinking of a chunked / streaming input?
If not, I don't understand why not having enough bits would be success?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I'm thinking of the chunked/streaming case.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably worth adding a TODO to handle later.

src_bits.consume(lit_or_len_decoded.encoded_size);
const auto maybe_lit_or_len =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks better to me.

I think this could still benefit from breaking out smaller functions - although if you think that's too much work, feel free to ignore.

  const auto done = 
    parse_lit_or_len(lit_or_len_decoded.symbol, src_bits)
      .and_then(decode_lit_or_len_into(dst))
      .or_else(convert_error_status)
      
   if (done) {
     return *done;
   }

   // done is empty, so the the loop continues

where convert_error_status: ParseLitOrLenStatus -> optional<DecompressStatus>

return {in_place, e == EndOfBlock ? Success : InvalidLitOrLen};

and decode_lit_or_len_into uses the poor version of pattern matching available in C++:

auto decode_lit_or_len_into(std::span<std::byte>& dst)
{
  return [&dst](variant<std::byte, std::uint16_t> lit_or_len) {
    return visit(
      overloaded({
        [&dst](byte lit) { return decode_into(lit, dst); },
        [&dst](uint16_t len) { return decode_into(lit, dst); }
      }),
      lit_or_len
    );
  };
}

// https://en.cppreference.com/w/cpp/utility/variant/visit
template <class... Ts>
struct overloaded : Ts... { using Ts::operator()...; };

and

auto decode_into(byte lit, span& dst)
  -> optional<DecompressStatus>
{
  // lines 133-137
}
auto decode_into(uint16_t len, span& dst)
  -> optional<DecompressStatus>
{
 // lines 139-161
}

although that's a bit of a simplification - you'll also need to capture dst_written.

The different cases are broken out into different functions, although there's some logic inversion with variant::visit.

parse_lit_or_len(lit_or_len_decoded.symbol, src_bits);
if (not maybe_lit_or_len) {
if (maybe_lit_or_len.error() == ParseLitOrLenStatus::EndOfBlock) {
return DecompressStatus::Success;
}
return DecompressStatus::InvalidLitOrLen;

Check warning on line 161 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L161

Added line #L161 was not covered by tests
}
const auto lit_or_len = maybe_lit_or_len.value();
if (std::holds_alternative<std::byte>(lit_or_len)) {
if (dst.size() - static_cast<std::size_t>(dst_written) < 1) {
return DecompressStatus::DstTooSmall;
}

Check warning on line 167 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L166-L167

Added lines #L166 - L167 were not covered by tests
dst[static_cast<size_t>(dst_written++)] = std::get<std::byte>(lit_or_len);
continue;
}
// It's not a literal, so handle length and distance
const auto len = std::get<std::uint16_t>(lit_or_len);
const auto dist_decoded = huffman::decode_one(dist_table, src_bits);
const auto dist_code = dist_decoded.symbol;
if (not dist_decoded.encoded_size) {
return DecompressStatus::InvalidDistance;
}

Check warning on line 177 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L176-L177

Added lines #L176 - L177 were not covered by tests
src_bits.consume(dist_decoded.encoded_size);
if (dist_code >= detail::distance_infos.size()) {
return DecompressStatus::InvalidLitOrLen;
}

Check warning on line 181 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L180-L181

Added lines #L180 - L181 were not covered by tests
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
const auto& dist_info = detail::distance_infos[dist_code];
const std::uint16_t distance =
dist_info.base + pop_extra_bits(src_bits, dist_info.extra_bits);
if (distance > dst_written) {
return DecompressStatus::InvalidDistance;
}

Check warning on line 188 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L187-L188

Added lines #L187 - L188 were not covered by tests
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
garymm marked this conversation as resolved.
Show resolved Hide resolved
return DecompressStatus::DstTooSmall;
}

Check warning on line 191 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L190-L191

Added lines #L190 - L191 were not covered by tests
starflate::detail::copy_from_before(
distance, dst.begin() + dst_written, len);
dst_written += len;
}
return DecompressStatus::Success;

Check warning on line 196 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L196

Added line #L196 was not covered by tests
}

/// Copy n bytes from distance bytes before dst to dst.
void copy_from_before(
std::uint16_t distance, std::span<std::byte>::iterator dst, std::uint16_t n)
{
std::ptrdiff_t n_signed{n};
const auto src = dst - distance;
while (n_signed > 0) {
const auto n_to_copy = std::min(n_signed, dst - src);
oliverlee marked this conversation as resolved.
Show resolved Hide resolved
dst = std::copy_n(src, n_to_copy, dst);
n_signed -= n_to_copy;
}
}

} // namespace detail

auto decompress(std::span<const std::byte> src, std::span<std::byte> dst)
Expand All @@ -39,7 +217,8 @@
using enum detail::BlockType;

huffman::bit_span src_bits{src};
// std::size_t dst_written{};
// will always be > 0, but signed type to minimize conversions.
std::ptrdiff_t dst_written{};
for (bool was_final = false; not was_final;) {
const auto header = detail::read_header(src_bits);
if (not header) {
Expand All @@ -60,22 +239,27 @@
return DecompressStatus::SrcTooSmall;
}

if (dst.size() < len) {
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (dst.size() - static_cast<std::size_t>(dst_written) < len) {
if (dst.size() < len) {

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this comment is irrelevant unless we drpo the prefix of dst every time as suggested below.

return DecompressStatus::DstTooSmall;
}

std::copy_n(src_bits.byte_data(), len, dst.begin());
std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written);
std::copy_n(src_bits.byte_data(), len, dst.begin());
dst = dst | std::views::drop(len);

views::drop has a specialization for span which can be used to trim off the front
https://en.cppreference.com/w/cpp/ranges/drop_view

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we advance dst, my guess would be it's UB to do dst.begin() - distance. I need to go backwards. If you have a better suggestion LMK.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will be UB if the pointer goes off the front of the original buffer.

buf = array<byte, 8>{};
buf: [-|-|-|-|-|-|-|-]

dst = span{buf}
dst: [-|-|-|-|-|-|-|-]

dst = dst | drop(4)
dst:         [-|-|-|-]

it1 = dst.begin()
buf: [-|-|-|-|-|-|-|-]
it1:          ^ 

it2 = it1 - 2
buf: [-|-|-|-|-|-|-|-]
it2:      ^

Maybe it would make sense to create a destination_buffer type that handles this logic and catches potential UB.

As for how to define the destination buffer, that could depend on how you want to handle growth.

Would it act like a std::vector where it allocates a new buffer and copies elements over? In this case, maybe destination_buffer stores a pointer and 2 integers.

Or would it maintain a list of chunks so growth doesn't require copying elements from the old buffer to a new buffer? I'm not sure which approach is better, but it would probably help to abstract this part so the rest of the decompression implementation is independent.

src_bits.consume(CHAR_BIT * len);
dst = dst.subspan(len);
// dst_written += len;
dst_written += len;
} else if (header->type == FixedHuffman) {
const auto block_status = detail::decompress_block_huffman(
src_bits,
dst,
dst_written,
detail::fixed_len_table,
detail::fixed_dist_table);
if (block_status != DecompressStatus::Success) {
return block_status;
}

Check warning on line 258 in src/decompress.cpp

View check run for this annotation

Codecov / codecov/patch

src/decompress.cpp#L257-L258

Added lines #L257 - L258 were not covered by tests
} else {
// TODO: implement
return DecompressStatus::Error;
}
const auto distance =
std::distance(std::ranges::data(src), src_bits.byte_data());
assert(distance >= 0 and "distance must be positive");
src = src.subspan(static_cast<size_t>(distance));
}
return DecompressStatus::Success;
}
Expand Down
16 changes: 16 additions & 0 deletions src/decompress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "huffman/huffman.hpp"

#include <array>
#include <cstddef>
#include <expected>
#include <ranges>
Expand All @@ -18,6 +19,8 @@ enum class DecompressStatus : std::uint8_t
NoCompressionLenMismatch,
DstTooSmall,
SrcTooSmall,
InvalidLitOrLen,
InvalidDistance,
};

namespace detail {
Expand All @@ -37,6 +40,19 @@ struct BlockHeader

auto read_header(huffman::bit_span& compressed_bits)
-> std::expected<BlockHeader, DecompressStatus>;

/// Copies n bytes from (dst - distance) to dst, handling overlap by repeating.
///
/// From RFC 3.2.3:
/// the referenced string may overlap the current position; for example, if the
/// last 2 bytes decoded have values X and Y, a string reference with
/// <length = 5, distance = 2> adds X,Y,X,Y,X to the output stream.
///
/// @pre dst - distance is valid.
void copy_from_before(
std::uint16_t distance,
std::span<std::byte>::iterator dst,
std::uint16_t n);
} // namespace detail

/// Decompresses the given source data into the destination buffer.
Expand Down
1 change: 1 addition & 0 deletions src/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ cc_test(
timeout = "short",
srcs = ["decompress_test.cpp"],
data = [
":starfleet.html",
":starfleet.html.dynamic",
":starfleet.html.fixed",
],
Expand Down
Loading
Loading