diff --git a/huffman/src/decode.hpp b/huffman/src/decode.hpp index f9178bf..230e032 100644 --- a/huffman/src/decode.hpp +++ b/huffman/src/decode.hpp @@ -21,12 +21,42 @@ namespace starflate::huffman { /// @tparam Symbol The type of the symbols in the code table. /// @tparam Extent The extent of the code table. /// @tparam O The type of the output iterator. -template < - symbol Symbol, - std::size_t Extent = std::dynamic_extent, - std::output_iterator O> +template O> constexpr auto decode(const table& code_table, bit_span bits, O output) -> O +{ + while (!bits.empty()) { + auto result = decode_one(code_table, bits); + if (result.encoded_size == 0) { + break; + } + *output = result.symbol; + output++; + bits.consume(result.encoded_size); + } + return output; +} + +template +struct decode_result +{ + Symbol symbol; + std::uint8_t encoded_size; +}; + +/// Decodes a single symbol from \p bits using \p code_table. +/// +/// @param code_table The code table to use for decoding. +/// @param bits The bit stream to decode. +/// +/// @returns The decoded symbol and the number of bits in its code. +/// If no symbol was found, result.encoded_size == 0. +/// @tparam Symbol The type of the symbols in the code table. +/// @tparam Extent The extent of the code table. 
+template +constexpr auto +decode_one(const table& code_table, bit_span bits) + -> decode_result { code current_code{}; auto code_table_pos = code_table.begin(); @@ -34,17 +64,14 @@ decode(const table& code_table, bit_span bits, O output) -> O current_code << bit; auto found = code_table.find(current_code, code_table_pos); if (found) { - *output = (*found)->symbol; - output++; - code_table_pos = code_table.begin(); - current_code = code{}; - continue; + return {(*found)->symbol, (*found)->bitsize()}; } if (found.error() == code_table.end()) { break; } code_table_pos = found.error(); } - return output; + return {Symbol{}, 0}; } + } // namespace starflate::huffman diff --git a/huffman/test/bit_span_test.cpp b/huffman/test/bit_span_test.cpp index 1ef954a..2100e0c 100644 --- a/huffman/test/bit_span_test.cpp +++ b/huffman/test/bit_span_test.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include @@ -116,6 +115,9 @@ auto main() -> int if (std::cmp_less(n, initial_bits.size())) { expect(nth_bit(n) == bits[0]); } + if (n == 0) { + expect(initial_bits.byte_data() == bits.byte_data()); + } } else { expect(aborts([&] { bits.consume(n); })); } diff --git a/src/decompress.cpp b/src/decompress.cpp index 9a986f4..0207e7d 100644 --- a/src/decompress.cpp +++ b/src/decompress.cpp @@ -1,6 +1,7 @@ #include "decompress.hpp" #include +#include #include #include @@ -31,6 +32,183 @@ auto read_header(huffman::bit_span& compressed_bits) return BlockHeader{final, type}; } +// RFC 1951 3.2.6: fixed (static) Huffman literal/length table +// +// literal/length bitsize code +// ============== ======= ========================= +// 0 - 143 8 0011'0000 - 1011'1111 +// 144 - 255 9 1'1001'0000 - 1'1111'1111 +// 256 - 279 7 000'0000 - 001'0111 +// 280 - 287 8 1100'0000 - 1100'0111 + +constexpr std::size_t fixed_len_table_size = 288; + +constexpr auto fixed_len_table = // clang-format off + huffman::table{ + huffman::symbol_bitsize, + {{{ 0, 143}, 8}, + {{144, 255}, 9}, + {{256, 279}, 7}, + 
{{280, 287}, 
8}}}; +// clang-format on + +constexpr std::size_t fixed_dist_table_size = 32; + +constexpr auto fixed_dist_table = huffman::table< + std::uint16_t, + fixed_dist_table_size>{huffman::symbol_bitsize, {{{0, 31}, 5}}}; + +struct LengthInfo +{ + std::uint8_t extra_bits; + std::uint16_t base; +}; + +constexpr auto lit_or_len_end_of_block = std::uint16_t{256}; +constexpr auto lit_or_len_max = std::uint16_t{285}; +constexpr auto lit_or_len_max_decoded = std::uint16_t{258}; + +// RFC 1951 3.2.5: Compressed blocks (length and distance codes) +constexpr auto length_infos = std::array{ + {{0, 3}, {0, 4}, {0, 5}, {0, 6}, {0, 7}, {0, 8}, {0, 9}, + {0, 10}, {1, 11}, {1, 13}, {1, 15}, {1, 17}, {2, 19}, {2, 23}, + {2, 27}, {2, 31}, {3, 35}, {3, 43}, {3, 51}, {3, 59}, {4, 67}, + {4, 83}, {4, 99}, {4, 115}, {5, 131}, {5, 163}, {5, 195}, {5, 227}}}; + +constexpr auto distance_infos = std::array{ + {{0, 1}, {0, 2}, {0, 3}, {0, 4}, {1, 5}, + {1, 7}, {2, 9}, {2, 13}, {3, 17}, {3, 25}, + {4, 33}, {4, 49}, {5, 65}, {5, 97}, {6, 129}, + {6, 193}, {7, 257}, {7, 385}, {8, 513}, {8, 769}, + {9, 1025}, {9, 1537}, {10, 2049}, {10, 3073}, {11, 4097}, + {11, 6145}, {12, 8193}, {12, 12289}, {13, 16385}, {13, 24577}}}; + +/// Removes n bits from the beginning of bits and returns them. +/// +/// @pre bits contains at least n bits. +/// @pre n <= 16 +/// +/// @returns the n bits removed from the beginning of bits. +/// The bits are in the lower (rightmost) part of the return value. 
+/// +auto pop_extra_bits(huffman::bit_span& bits, std::uint8_t n) -> std::uint16_t +{ + assert(n <= 16); + auto iter = bits.begin(); + std::uint16_t res{}; + for (std::uint8_t i{}; i < n; i++) { + res |= static_cast( + static_cast(static_cast(*iter)) << i); + iter += 1; + } + bits.consume(n); // invalidates iter, so must come after the loop + return res; +} + +enum class ParseLitOrLenStatus : std::uint8_t +{ + EndOfBlock, + Error, +}; + +auto parse_lit_or_len( + std::uint16_t lit_or_len, huffman::bit_span& src_bits) -> std:: + expected, ParseLitOrLenStatus> +{ + if (lit_or_len < detail::lit_or_len_end_of_block) { + return static_cast(lit_or_len); + } + if (lit_or_len == detail::lit_or_len_end_of_block) { + return std::unexpected{ParseLitOrLenStatus::EndOfBlock}; + } + if (lit_or_len > detail::lit_or_len_max) { + return std::unexpected{ParseLitOrLenStatus::Error}; + } + std::uint16_t len{}; + if (lit_or_len == detail::lit_or_len_max) { + len = detail::lit_or_len_max_decoded; + } else { + const auto len_idx = + static_cast(lit_or_len - detail::lit_or_len_end_of_block - 1); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) + const auto& len_info = detail::length_infos[len_idx]; + const auto extra_len = pop_extra_bits(src_bits, len_info.extra_bits); + len = len_info.base + extra_len; + } + return len; +} + +auto decompress_block_huffman( + huffman::bit_span& src_bits, + std::span dst, + std::ptrdiff_t& dst_written, + const huffman::table& len_table, + const huffman::table& dist_table) + -> DecompressStatus +{ + while (true) { + const auto lit_or_len_decoded = huffman::decode_one(len_table, src_bits); + if (not lit_or_len_decoded.encoded_size) { + return DecompressStatus::InvalidLitOrLen; + } + src_bits.consume(lit_or_len_decoded.encoded_size); + const auto maybe_lit_or_len = + parse_lit_or_len(lit_or_len_decoded.symbol, src_bits); + if (not maybe_lit_or_len) { + if (maybe_lit_or_len.error() == ParseLitOrLenStatus::EndOfBlock) { + return 
DecompressStatus::Success; + } + return DecompressStatus::InvalidLitOrLen; + } + const auto lit_or_len = maybe_lit_or_len.value(); + if (std::holds_alternative(lit_or_len)) { + if (dst.size() - static_cast(dst_written) < 1) { + return DecompressStatus::DstTooSmall; + } + dst[static_cast(dst_written++)] = std::get(lit_or_len); + continue; + } + // It's not a literal, so handle length and distance + const auto len = std::get(lit_or_len); + const auto dist_decoded = huffman::decode_one(dist_table, src_bits); + const auto dist_code = dist_decoded.symbol; + if (not dist_decoded.encoded_size) { + return DecompressStatus::InvalidDistance; + } + src_bits.consume(dist_decoded.encoded_size); + if (dist_code >= detail::distance_infos.size()) { + return DecompressStatus::InvalidDistance; // symbol has no distance_infos entry + } + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) + const auto& dist_info = detail::distance_infos[dist_code]; + const std::uint16_t distance = + dist_info.base + pop_extra_bits(src_bits, dist_info.extra_bits); + if (distance > dst_written) { + return DecompressStatus::InvalidDistance; + } + if (dst.size() - static_cast(dst_written) < len) { + return DecompressStatus::DstTooSmall; + } + starflate::detail::copy_from_before( + distance, dst.begin() + dst_written, len); + dst_written += len; + } + return DecompressStatus::Success; +} + +/// Copy n bytes from distance bytes before dst to dst. 
+void copy_from_before( + std::uint16_t distance, std::span::iterator dst, std::uint16_t n) +{ + std::ptrdiff_t n_signed{n}; + const auto src = dst - distance; + while (n_signed > 0) { + const auto n_to_copy = std::min(n_signed, dst - src); + dst = std::copy_n(src, n_to_copy, dst); + n_signed -= n_to_copy; + } +} + } // namespace detail auto decompress(std::span src, std::span dst) @@ -39,7 +217,8 @@ auto decompress(std::span src, std::span dst) using enum detail::BlockType; huffman::bit_span src_bits{src}; - // std::size_t dst_written{}; + // will always be >= 0, but signed type to minimize conversions. + std::ptrdiff_t dst_written{}; for (bool was_final = false; not was_final;) { const auto header = detail::read_header(src_bits); if (not header) { @@ -60,22 +239,27 @@ auto decompress(std::span src, std::span dst) return DecompressStatus::SrcTooSmall; } - if (dst.size() < len) { + if (dst.size() - static_cast(dst_written) < len) { return DecompressStatus::DstTooSmall; } - std::copy_n(src_bits.byte_data(), len, dst.begin()); + std::copy_n(src_bits.byte_data(), len, dst.begin() + dst_written); src_bits.consume(CHAR_BIT * len); - dst = dst.subspan(len); - // dst_written += len; + dst_written += len; + } else if (header->type == FixedHuffman) { + const auto block_status = detail::decompress_block_huffman( + src_bits, + dst, + dst_written, + detail::fixed_len_table, + detail::fixed_dist_table); + if (block_status != DecompressStatus::Success) { + return block_status; + } } else { // TODO: implement return DecompressStatus::Error; } - const auto distance = - std::distance(std::ranges::data(src), src_bits.byte_data()); - assert(distance >= 0 and "distance must be positive"); - src = src.subspan(static_cast(distance)); } return DecompressStatus::Success; } diff --git a/src/decompress.hpp b/src/decompress.hpp index 07f88f4..52f7305 100644 --- a/src/decompress.hpp +++ b/src/decompress.hpp @@ -2,6 +2,7 @@ #include "huffman/huffman.hpp" +#include #include #include #include @@ 
-18,6 +19,8 @@ enum class DecompressStatus : std::uint8_t NoCompressionLenMismatch, DstTooSmall, SrcTooSmall, + InvalidLitOrLen, + InvalidDistance, }; namespace detail { @@ -37,6 +40,19 @@ struct BlockHeader auto read_header(huffman::bit_span& compressed_bits) -> std::expected; + +/// Copies n bytes from (dst - distance) to dst, handling overlap by repeating. +/// +/// From RFC 1951 3.2.3: +/// the referenced string may overlap the current position; for example, if the +/// last 2 bytes decoded have values X and Y, a string reference with +/// <length = 5, distance = 2> adds X,Y,X,Y,X to the output stream. +/// +/// @pre dst - distance is valid and distance > 0. +void copy_from_before( + std::uint16_t distance, + std::span::iterator dst, + std::uint16_t n); } // namespace detail /// Decompresses the given source data into the destination buffer. diff --git a/src/test/BUILD.bazel b/src/test/BUILD.bazel index 5579a95..f8b3a27 100644 --- a/src/test/BUILD.bazel +++ b/src/test/BUILD.bazel @@ -6,6 +6,7 @@ cc_test( timeout = "short", srcs = ["decompress_test.cpp"], data = [ + ":starfleet.html", ":starfleet.html.dynamic", ":starfleet.html.fixed", ], diff --git a/src/test/decompress_test.cpp b/src/test/decompress_test.cpp index c80083f..0f56712 100644 --- a/src/test/decompress_test.cpp +++ b/src/test/decompress_test.cpp @@ -28,8 +28,12 @@ auto read_runfile(const char* argv0, const std::string& path) const std::string abs_path{runfiles->Rlocation(path)}; std::ifstream file{abs_path, std::ios::binary}; - ::boost::ut::expect(::boost::ut::fatal(file.is_open())) - << "failed to open " << path; + if (not file.is_open()) { + // ::boost::ut::fatal swallows log messages, so log before. 
+ ::boost::ut::log("failed to open file: " + abs_path); + ::boost::ut::expect(::boost::ut::fatal(false)); + } + std::vector chars( (std::istreambuf_iterator(file)), std::istreambuf_iterator()); @@ -50,7 +54,6 @@ auto main(int, char* argv[]) -> int { using ::boost::ut::eq; using ::boost::ut::expect; - using ::boost::ut::fatal; using ::boost::ut::test; using namespace starflate; @@ -137,6 +140,15 @@ auto main(int, char* argv[]) -> int expect(header.has_value()) << "got error: " << static_cast(header.error()); expect(header->type == detail::BlockType::FixedHuffman); + + const std::vector expected_bytes = + read_runfile(*argv, "starflate/src/test/starfleet.html"); + std::vector dst(expected_bytes.size()); + const auto status = decompress(input_bytes, dst); + expect(status == DecompressStatus::Success) + << "got error code: " << static_cast(status); + expect(std::ranges::equal(dst, expected_bytes)) + << "decompressed does not match expected"; }; test("dynamic huffman") = [argv] { @@ -149,4 +161,11 @@ auto main(int, char* argv[]) -> int << "got error: " << static_cast(header.error()); expect(header->type == detail::BlockType::DynamicHuffman); }; + + test("copy_from_before") = [] { + auto src_and_dst = huffman::byte_array(1, 2, 0, 0, 0, 0); + const auto dst_span = std::span{src_and_dst}.subspan(2); + detail::copy_from_before(2, dst_span.begin(), 3); + expect(eq(src_and_dst, huffman::byte_array(1, 2, 1, 2, 1, 0))); + }; };