From b368df486c67469289287f763ac05c29c230d396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Sun, 12 May 2024 23:10:34 +0200 Subject: [PATCH 01/50] Added support for the legacy compressions (shrink/reduce/implode). --- Cargo.toml | 1 + src/compression.rs | 37 ++- src/legacy/bitstream.rs | 156 ++++++++++++ src/legacy/huffman.rs | 208 +++++++++++++++ src/legacy/implode.rs | 317 +++++++++++++++++++++++ src/legacy/lz77.rs | 10 + src/legacy/mod.rs | 9 + src/legacy/reduce.rs | 486 ++++++++++++++++++++++++++++++++++++ src/legacy/shrink.rs | 426 +++++++++++++++++++++++++++++++ src/lib.rs | 2 + src/read.rs | 96 ++++++- src/types.rs | 3 + src/write.rs | 13 + tests/data/folder/first.txt | 1 + tests/data/implode.zip | Bin 0 -> 800 bytes tests/data/reduce.zip | Bin 0 -> 1058 bytes tests/data/shrink.zip | Bin 0 -> 825 bytes tests/legacy_zip.rs | 55 ++++ 18 files changed, 1817 insertions(+), 3 deletions(-) create mode 100644 src/legacy/bitstream.rs create mode 100644 src/legacy/huffman.rs create mode 100644 src/legacy/implode.rs create mode 100644 src/legacy/lz77.rs create mode 100644 src/legacy/mod.rs create mode 100644 src/legacy/reduce.rs create mode 100644 src/legacy/shrink.rs create mode 100644 tests/data/folder/first.txt create mode 100644 tests/data/implode.zip create mode 100644 tests/data/reduce.zip create mode 100644 tests/data/shrink.zip create mode 100644 tests/legacy_zip.rs diff --git a/Cargo.toml b/Cargo.toml index cf36d0041..7e31a0b03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ deflate-zlib = ["flate2/zlib", "_deflate-any"] deflate-zlib-ng = ["flate2/zlib-ng", "_deflate-any"] deflate-zopfli = ["zopfli", "_deflate-any"] lzma = ["lzma-rs/stream"] +legacy-zip = [] unreserved = [] default = [ "aes-crypto", diff --git a/src/compression.rs b/src/compression.rs index 3dd6eced2..dd2ba25d1 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -38,6 +38,16 @@ pub enum CompressionMethod { /// Compress the file using LZMA #[cfg(feature 
= "lzma")] Lzma, + + /// Legacy format + #[cfg(feature = "legacy-zip")] + Shrink, + /// Reduce (Method 2-5) + #[cfg(feature = "legacy-zip")] + Reduce(u8), + /// Method 6 Implode/explode + #[cfg(feature = "legacy-zip")] + Implode, /// Unsupported compression method #[cfg_attr( not(fuzzing), @@ -49,11 +59,17 @@ pub enum CompressionMethod { /// All compression methods defined for the ZIP format impl CompressionMethod { pub const STORE: Self = CompressionMethod::Stored; - pub const SHRINK: Self = CompressionMethod::Unsupported(1); + #[cfg(feature = "legacy-zip")] + pub const SHRINK: Self = CompressionMethod::Shrink; + #[cfg(feature = "legacy-zip")] pub const REDUCE_1: Self = CompressionMethod::Unsupported(2); + #[cfg(feature = "legacy-zip")] pub const REDUCE_2: Self = CompressionMethod::Unsupported(3); + #[cfg(feature = "legacy-zip")] pub const REDUCE_3: Self = CompressionMethod::Unsupported(4); + #[cfg(feature = "legacy-zip")] pub const REDUCE_4: Self = CompressionMethod::Unsupported(5); + #[cfg(feature = "legacy-zip")] pub const IMPLODE: Self = CompressionMethod::Unsupported(6); #[cfg(feature = "_deflate-any")] pub const DEFLATE: Self = CompressionMethod::Deflated; @@ -99,6 +115,18 @@ impl CompressionMethod { #[allow(deprecated)] match val { 0 => CompressionMethod::Stored, + #[cfg(feature = "legacy-zip")] + 1 => CompressionMethod::Shrink, + #[cfg(feature = "legacy-zip")] + 2 => CompressionMethod::Reduce(1), + #[cfg(feature = "legacy-zip")] + 3 => CompressionMethod::Reduce(2), + #[cfg(feature = "legacy-zip")] + 4 => CompressionMethod::Reduce(3), + #[cfg(feature = "legacy-zip")] + 5 => CompressionMethod::Reduce(4), + #[cfg(feature = "legacy-zip")] + 6 => CompressionMethod::Implode, #[cfg(feature = "_deflate-any")] 8 => CompressionMethod::Deflated, #[cfg(feature = "deflate64")] @@ -125,6 +153,13 @@ impl CompressionMethod { #[allow(deprecated)] match self { CompressionMethod::Stored => 0, + #[cfg(feature = "legacy-zip")] + CompressionMethod::Shrink => 1, + #[cfg(feature 
= "legacy-zip")] + CompressionMethod::Reduce(n) => 1 + n as u16, + #[cfg(feature = "legacy-zip")] + CompressionMethod::Implode => 6, + #[cfg(feature = "_deflate-any")] CompressionMethod::Deflated => 8, #[cfg(feature = "deflate64")] diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs new file mode 100644 index 000000000..3fdb55ee9 --- /dev/null +++ b/src/legacy/bitstream.rs @@ -0,0 +1,156 @@ +/// Get the n least significant bits of x. +pub fn lsb(x: u64, n: u8) -> u64 { + assert!(n <= 63); + x & ((1u64.wrapping_shl(n as u32)) - 1u64) +} + +/// Reverse the n least significant bits of x. +/// The (16 - n) most significant bits of the result will be zero. +pub fn reverse16(x: u16, n: usize) -> u16 { + assert!(n > 0); + assert!(n <= 16); + return x.reverse_bits() >> (16 - n); +} + +/* +pub fn round_up(x: usize, m: usize) -> usize { + assert!((m & (m - 1)) == 0, "m must be a power of two"); + (x + m - 1) & (-(m as i64)) as usize // Hacker's Delight (2nd), 3-1. +} +*/ +/// Input bitstream. +pub struct BitStream<'a> { + src: &'a [u8], /* Source bytes. */ + bitpos: usize, /* Position of the next bit to read. */ + bitpos_end: usize, /* Position of past-the-end bit. */ +} + +/// Initialize an input stream to present the n bytes from src as an LSB-first +/// bitstream. +impl<'a> BitStream<'a> { + pub fn new(src: &'a [u8], n: usize) -> Self { + Self { + src, + bitpos: 0, + bitpos_end: n * 8, + } + } + + /// Get the next bits from the input stream. The number of bits returned is + /// between ISTREAM_MIN_BITS and 64, depending on the position in the stream, or + /// fewer if the end of stream is reached. The upper bits are zero-padded. + pub fn bits(&mut self) -> u64 { + let next = self.bitpos / 8; + assert!(next < self.src.len(), "Cannot read past end of stream."); + + let bits = if next + 8 <= self.src.len() { + // Common case: read 8 bytes in one go. 
+ u64::from_le_bytes(self.src[next..next + 8].try_into().unwrap()) + } else { + // Read the available bytes and zero-pad. + let mut bits = 0; + for i in 0..self.src.len() - next { + bits |= (self.src[next + i] as u64).wrapping_shl(i as u32 * 8); + } + bits + }; + + return bits >> (self.bitpos % 8); + } + + /// Advance n bits in the bitstream if possible. Returns false if that many bits + /// are not available in the stream. + pub fn advance(&mut self, n: u8) -> bool { + assert!(self.bitpos <= self.bitpos_end); + + if self.bitpos_end - self.bitpos < n as usize { + return false; + } + + self.bitpos += n as usize; + return true; + } + + /// Align the input stream to the next 8-bit boundary and return a pointer to + /// that byte, which may be the past-the-end-of-stream byte. + pub fn _byte_align(&mut self) -> usize { + assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); + self.bitpos = 8 * (self.bitpos / 8); + assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); + return self.bitpos / 8; + } + + pub fn bytes_read(&self) -> usize { + (self.bitpos + 7) / 8 + } +} + +pub const ISTREAM_MIN_BITS: usize = 64 - 7; + +#[cfg(test)] +mod tests { + use crate::legacy::bitstream::{lsb, reverse16}; + + #[test] + fn test_reverse16() { + assert_eq!(reverse16(0x0000, 1), 0x0); + assert_eq!(reverse16(0xffff, 1), 0x1); + assert_eq!(reverse16(0x0000, 16), 0x0); + assert_eq!(reverse16(0xffff, 16), 0xffff); + // 0001 0010 0011 0100 -> 0010 1100 0100 1000 + assert_eq!(reverse16(0x1234, 16), 0x2c48); + // 111 1111 0100 0001 -> 100 0001 0111 1111 + assert_eq!(reverse16(0x7f41, 15), 0x417f); + } + /* + #[test] + fn test_bits_round_up() { + assert_eq!(round_up(0, 4), 0); + assert_eq!(round_up(1, 4), 4); + assert_eq!(round_up(2, 4), 4); + assert_eq!(round_up(3, 4), 4); + assert_eq!(round_up(4, 4), 4); + assert_eq!(round_up(5, 4), 8); + }*/ + + #[test] + fn test_bits_test_bits_lsbround_up() { + assert_eq!(lsb(0x1122334455667788, 0), 0x0); + 
assert_eq!(lsb(0x1122334455667788, 5), 0x8); + assert_eq!(lsb(0x7722334455667788, 63), 0x7722334455667788); + } + + #[test] + fn test_istream_basic() { + let bits = [0x47]; + let mut is = super::BitStream::new(&bits, 1); + + assert_eq!(lsb(is.bits(), 1), 1); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 1); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 1); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 0); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 0); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 0); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 1); + assert!(is.advance(1)); + assert_eq!(lsb(is.bits(), 1), 0); + assert!(is.advance(1)); + } + + #[test] + fn test_istream_case1() { + let bits = [0x45, 048]; + let mut is = super::BitStream::new(&bits, 9); + assert_eq!(lsb(is.bits(), 3), 0x05); + assert!(is.advance(3)); + + assert_eq!(lsb(is.bits(), 4), 0x08); + assert!(is.advance(4)); + } +} diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs new file mode 100644 index 000000000..c61988469 --- /dev/null +++ b/src/legacy/huffman.rs @@ -0,0 +1,208 @@ +use crate::legacy::bitstream::reverse16; + +use super::bitstream::lsb; + +#[derive(Default, Clone, Copy)] +pub struct TableEntry { + /// Wide enough to fit the max symbol nbr. + pub sym: u16, + /// 0 means no symbol. + pub len: u8, +} + +/// Deflate uses max 288 symbols. +const MAX_HUFFMAN_SYMBOLS: usize = 288; +/// Implode uses max 16-bit codewords. +const MAX_HUFFMAN_BITS: usize = 16; +/// Seems a good trade-off. +const HUFFMAN_LOOKUP_TABLE_BITS: u8 = 8; + +pub struct HuffmanDecoder { + /// Lookup table for fast decoding of short codewords. + pub table: [TableEntry; 1 << HUFFMAN_LOOKUP_TABLE_BITS], + /// "Sentinel bits" value for each codeword length. + pub sentinel_bits: [u32; MAX_HUFFMAN_BITS + 1], + /// First symbol index minus first codeword mod 2**16 for each length. 
+ pub offset_first_sym_idx: [u16; MAX_HUFFMAN_BITS + 1], + /// Map from symbol index to symbol. + pub syms: [u16; MAX_HUFFMAN_SYMBOLS], + // num_syms:usize +} + +impl Default for HuffmanDecoder { + fn default() -> Self { + let syms = [0; MAX_HUFFMAN_SYMBOLS]; + let table = [TableEntry::default(); 1 << HUFFMAN_LOOKUP_TABLE_BITS]; + Self { + table, + sentinel_bits: Default::default(), + offset_first_sym_idx: Default::default(), + syms, + } + } +} + +/// Initialize huffman decoder d for a code defined by the n codeword lengths. +/// Returns false if the codeword lengths do not correspond to a valid prefix +/// code. +impl HuffmanDecoder { + pub fn init(&mut self, lengths: &[u8], n: usize) -> bool { + let mut count = [0; MAX_HUFFMAN_BITS + 1]; + let mut code = [0; MAX_HUFFMAN_BITS + 1]; + let mut sym_idx = [0; MAX_HUFFMAN_BITS + 1]; + // Zero-initialize the lookup table. + for t in &mut self.table { + t.len = 0; + } + + // Count the number of codewords of each length. + for i in 0..n { + assert!(lengths[i] as usize <= MAX_HUFFMAN_BITS); + count[lengths[i] as usize] += 1; + } + count[0] = 0; // Ignore zero-length codewords. + + // Compute sentinel_bits and offset_first_sym_idx for each length. + code[0] = 0; + sym_idx[0] = 0; + for l in 1..=MAX_HUFFMAN_BITS { + // First canonical codeword of this length. + code[l] = ((code[l - 1] + count[l - 1]) << 1) as u16; + + if count[l] != 0 && code[l] as u32 + count[l] as u32 - 1 > (1u32 << l) - 1 { + // The last codeword is longer than l bits. + return false; + } + + let s = ((code[l] as u32 + count[l] as u32) << (MAX_HUFFMAN_BITS - l)) as u32; + self.sentinel_bits[l] = s; + assert!(self.sentinel_bits[l] >= code[l] as u32, "No overflow!"); + + sym_idx[l] = sym_idx[l - 1] + count[l - 1]; + self.offset_first_sym_idx[l] = sym_idx[l].wrapping_sub(code[l]); + } + + // Build mapping from index to symbol and populate the lookup table. 
+ for i in 0..n { + let l = lengths[i] as usize; + if l == 0 { + continue; + } + + self.syms[sym_idx[l] as usize] = i as u16; + sym_idx[l] += 1; + + if l <= HUFFMAN_LOOKUP_TABLE_BITS as usize { + self.table_insert(i, l, code[l]); + code[l] += 1; + } + } + + true + } + + pub fn table_insert(&mut self, sym: usize, len: usize, codeword: u16) { + assert!(len <= HUFFMAN_LOOKUP_TABLE_BITS as usize); + + let codeword = reverse16(codeword, len); // Make it LSB-first. + let pad_len = HUFFMAN_LOOKUP_TABLE_BITS as usize - len; + + // Pad the pad_len upper bits with all bit combinations. + for padding in 0..(1 << pad_len) { + let index = (codeword | (padding << len)) as usize; + assert!(sym <= u16::MAX as usize); + self.table[index].sym = sym as u16; + assert!(len <= u8::MAX as usize); + self.table[index].len = len as u8; + } + } + + /// Use the decoder d to decode a symbol from the LSB-first zero-padded bits. + /// Returns the decoded symbol number or -1 if no symbol could be decoded. + /// *num_used_bits will be set to the number of bits used to decode the symbol, + /// or zero if no symbol could be decoded. + pub fn huffman_decode(&mut self, bits: u16, num_used_bits: &mut u8) -> i32 { + // First try the lookup table. + let lookup_bits = lsb(bits as u64, HUFFMAN_LOOKUP_TABLE_BITS) as usize; + assert!(lookup_bits < self.table.len()); + + if self.table[lookup_bits].len != 0 { + assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); + // assert!(self.table[lookup_bits].sym < self.num_syms); + *num_used_bits = self.table[lookup_bits].len; + return self.table[lookup_bits].sym as i32; + } + + // Then do canonical decoding with the bits in MSB-first order. 
+ let mut bits = reverse16(bits, MAX_HUFFMAN_BITS); + for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..=MAX_HUFFMAN_BITS { + if (bits as u32) < self.sentinel_bits[l] { + bits >>= MAX_HUFFMAN_BITS - l; + + let sym_idx = (self.offset_first_sym_idx[l] as usize + bits as usize) & 0xFFFF; + //assert(sym_idx < self.num_syms); + + *num_used_bits = l as u8; + return self.syms[sym_idx] as i32; + } + } + *num_used_bits = 0; + -1 + } +} + +#[cfg(test)] +mod tests { + use super::HuffmanDecoder; + + #[test] + fn test_huffman_decode_basic() { + let lens = [ + 3, // sym 0: 000 + 3, // sym 1: 001 + 3, // sym 2: 010 + 3, // sym 3: 011 + 3, // sym 4: 100 + 3, // sym 5: 101 + 4, // sym 6: 1100 + 4, // sym 7: 1101 + 0, // sym 8: + 0, // sym 9: + 0, // sym 10: + 0, // sym 11: + 0, // sym 12: + 0, // sym 13: + 0, // sym 14: + 0, // sym 15: + 6, // sym 16: 111110 + 5, // sym 17: 11110 + 4, // sym 18: 1110 + ]; + + let mut d = HuffmanDecoder::default(); + assert!(d.init(&lens, lens.len())); + + let mut used = 0; + // 000 (msb-first) -> 000 (lsb-first) + assert_eq!(d.huffman_decode(0x0, &mut used), 0); + assert_eq!(used, 3); + + /* 011 (msb-first) -> 110 (lsb-first)*/ + assert_eq!(d.huffman_decode(0x6, &mut used), 3); + assert_eq!(used, 3); + + /* 11110 (msb-first) -> 01111 (lsb-first)*/ + assert_eq!(d.huffman_decode(0x0f, &mut used), 17); + assert_eq!(used, 5); + + /* 111110 (msb-first) -> 011111 (lsb-first)*/ + assert_eq!(d.huffman_decode(0x1f, &mut used), 16); + assert_eq!(used, 6); + + /* 1111111 (msb-first) -> 1111111 (lsb-first)*/ + assert_eq!(d.huffman_decode(0x7f, &mut used), -1); + + /* Make sure used is set even when decoding fails. 
*/ + assert_eq!(used, 0); + } +} diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs new file mode 100644 index 000000000..5ee3e252e --- /dev/null +++ b/src/legacy/implode.rs @@ -0,0 +1,317 @@ +use std::collections::VecDeque; +use std::io::{self, copy, Read, Result}; + +use thiserror::Error; + +use crate::legacy::bitstream::{lsb, ISTREAM_MIN_BITS}; +use crate::legacy::lz77::lz77_output_backref; + +use super::bitstream::BitStream; +use super::huffman::HuffmanDecoder; +//const COMPRESSED_BYTES_TO_BUFFER: usize = 4096; + +#[derive(Error, Debug)] +enum ImplodeError { + #[error("End of stream")] + EndOfStream, + + #[error("Too many codeword lengths")] + TooManyCodewordLengths, + + #[error("Too few codeword lengths")] + TooFewCodewordLengths, + + #[error("Higher count than available codewords")] + HigherCountThanAvailableCodewords, + + #[error("Not all codewords used")] + NotAllCodewordsUsed, +} + +/// Initialize the Huffman decoder d with num_lens codeword lengths read from is. +/// Returns false if the input is invalid. +fn read_huffman_code(is: &mut BitStream, num_lens: usize, d: &mut HuffmanDecoder) -> core::result::Result<(), ImplodeError> { + let mut lens = [0; 256]; + let mut len_count = [0; 17]; + // assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); + + // Number of bytes representing the Huffman code. + let byte = lsb(is.bits(), 8); + let num_bytes = (byte + 1) as usize; + if !is.advance(8) { + return Err(ImplodeError::EndOfStream); + + } + + let mut codeword_idx = 0; + for _byte_idx in 0..num_bytes { + let byte = lsb(is.bits(), 8); + if !is.advance(8) { + return Err(ImplodeError::EndOfStream); + } + + let codeword_len = (byte & 0xf) + 1; /* Low four bits plus one. */ + let run_length = (byte >> 4) + 1; /* High four bits plus one. 
*/ + + assert!(codeword_len >= 1 && codeword_len <= 16); + //assert!(codeword_len < sizeof(len_count) / sizeof(len_count[0])); + len_count[codeword_len as usize] += run_length; + + if (codeword_idx + run_length) as usize > num_lens { + return Err(ImplodeError::TooManyCodewordLengths); + } + for _ in 0..run_length { + assert!((codeword_idx as usize) < num_lens); + lens[codeword_idx as usize] = codeword_len as u8; + codeword_idx += 1; + } + } + + assert!(codeword_idx as usize <= num_lens); + if (codeword_idx as usize) < num_lens { + return Err(ImplodeError::TooFewCodewordLengths); + } + + // Check that the Huffman tree is full. + let mut avail_codewords = 1; + for i in 1..=16 { + assert!(avail_codewords >= 0); + avail_codewords *= 2; + avail_codewords -= len_count[i] as i32; + if avail_codewords < 0 { + return Err(ImplodeError::HigherCountThanAvailableCodewords); + } + } + if avail_codewords != 0 { + // Not all codewords were used. + return Err(ImplodeError::NotAllCodewordsUsed); + } + + let ok = d.init(&lens, num_lens); + assert!(ok, "The checks above mean the tree should be valid."); + Ok(()) +} + + +fn hwexplode( + src: &[u8], + src_len: usize, + uncomp_len: usize, + large_wnd: bool, + lit_tree: bool, + pk101_bug_compat: bool, + src_used: &mut usize, + dst: &mut VecDeque, +) -> core::result::Result<(), ImplodeError> { + let mut is = BitStream::new(src, src_len); + let mut lit_decoder = HuffmanDecoder::default(); + let mut len_decoder = HuffmanDecoder::default(); + let mut dist_decoder = HuffmanDecoder::default(); + if lit_tree { + read_huffman_code(&mut is, 256, &mut lit_decoder)?; + } + read_huffman_code(&mut is, 64, &mut len_decoder)?; + read_huffman_code(&mut is, 64, &mut dist_decoder)?; + let min_len = if pk101_bug_compat { + if large_wnd { + 3 + } else { + 2 + } + } else { + if lit_tree { + 3 + } else { + 2 + } + }; + + while dst.len() < uncomp_len { + let mut bits = is.bits(); + if lsb(bits, 1) == 0x1 { + // Literal. 
+ bits >>= 1; + let sym; + let mut used = 0; + if lit_tree { + sym = lit_decoder.huffman_decode(!bits as u16, &mut used); + assert!(sym >= 0, "huffman decode failed"); + if !is.advance(1 + used) { + return Err(ImplodeError::EndOfStream); + } + } else { + sym = lsb(bits, 8) as i32; + if !is.advance(1 + 8) { + return Err(ImplodeError::EndOfStream); + } + } + assert!(sym >= 0 && sym <= u8::MAX as i32); + dst.push_back(sym as u8); + continue; + } + // Backref. + assert!(lsb(bits, 1) == 0x0); + let mut used_tot = 1; + bits >>= 1; + + // Read the low dist bits. + let mut dist; + if large_wnd { + dist = lsb(bits, 7) as usize; + bits >>= 7; + used_tot += 7; + } else { + dist = lsb(bits, 6) as usize; + bits >>= 6; + used_tot += 6; + } + + // Read the Huffman-encoded high dist bits. + let mut used = 0; + let sym = dist_decoder.huffman_decode(!bits as u16, &mut used); + assert!(sym >= 0, "huffman decode failed"); + used_tot += used; + bits >>= used; + dist |= (sym as usize) << if large_wnd { 7 } else { 6 }; + dist += 1; + + // Read the Huffman-encoded len. + let sym = len_decoder.huffman_decode(!bits as u16, &mut used); + assert!(sym >= 0, "huffman decode failed"); + used_tot += used; + bits >>= used; + let mut len = (sym + min_len) as usize; + + if sym == 63 { + // Read an extra len byte. + len += lsb(bits, 8) as usize; + used_tot += 8; + // bits >>= 8; + } + + assert!((used_tot as usize) <= ISTREAM_MIN_BITS); + if !is.advance(used_tot) { + return Err(ImplodeError::EndOfStream); + } + // let len = len.min(uncomp_len - dst.len()); + + if len <= uncomp_len - dst.len() && dist <= dst.len() { + // Enough room and no implicit zeros; chunked copy. + lz77_output_backref(dst, dist, len); + } else { + // Copy, handling overlap and implicit zeros. 
+ for _i in 0..len { + if dist > dst.len() { + dst.push_back(0); + continue; + } + dst.push_back(dst[dst.len() - dist]); + } + } + } + + *src_used = is.bytes_read(); + Ok(()) +} + +#[derive(Debug)] +pub struct ImplodeDecoder { + compressed_reader: R, + uncompressed_size: u64, + stream_read: bool, + large_wnd: bool, + lit_tree: bool, + stream: VecDeque, +} + +impl ImplodeDecoder { + pub fn new(inner: R, uncompressed_size: u64, flags: u16) -> Self { + let large_wnd = (flags & 2) != 0; + let lit_tree = (flags & 4) != 0; + ImplodeDecoder { + compressed_reader: inner, + uncompressed_size, + stream_read: false, + large_wnd, + lit_tree, + stream: VecDeque::new(), + } + } + + pub fn finish(mut self) -> Result> { + copy(&mut self.compressed_reader, &mut self.stream)?; + Ok(self.stream) + } +} + +impl Read for ImplodeDecoder { + fn read(&mut self, buf: &mut [u8]) -> Result { + if !self.stream_read { + self.stream_read = true; + let mut compressed_bytes = Vec::new(); + if let Err(err) = self.compressed_reader.read_to_end(&mut compressed_bytes) { + return Err(err.into()); + } + let mut src_used = 0; + if let Err(err) = hwexplode( + &compressed_bytes, + compressed_bytes.len(), + self.uncompressed_size as usize, + self.large_wnd, + self.lit_tree, + false, + &mut src_used, + &mut self.stream, + ) { + return Err(io::Error::new(io::ErrorKind::InvalidData, err.to_string())); + } + } + let bytes_read = self.stream.len().min(buf.len()); + buf[..bytes_read].copy_from_slice(&self.stream.drain(..bytes_read).collect::>()); + Ok(bytes_read) + } +} + +#[cfg(test)] +mod tests { + use std::collections::VecDeque; + + use super::hwexplode; + + const HAMLET_256: [u8; 249] = [ + 0x0d, 0x02, 0x01, 0x12, 0x23, 0x14, 0x15, 0x36, 0x37, 0x68, 0x89, 0x9a, 0xdb, 0x3c, 0x05, + 0x06, 0x12, 0x13, 0x44, 0xc5, 0xf6, 0x96, 0xf7, 0xdf, 0xef, 0xfe, 0xdd, 0x50, 0x21, 0x54, + 0xb9, 0x6f, 0xd5, 0x96, 0x1d, 0x4b, 0x17, 0xe4, 0xd1, 0xba, 0x74, 0xcb, 0xba, 0x15, 0x5b, + 0x56, 0xee, 0x59, 0x90, 0x45, 0x85, 0xbe, 0x7d, 
0xbb, 0x16, 0xe4, 0x5b, 0xb3, 0x20, 0x91, + 0x86, 0x6d, 0xcb, 0xb6, 0x2c, 0x5d, 0x96, 0x20, 0xc5, 0xe6, 0x05, 0x79, 0x35, 0x2d, 0x5b, + 0xb6, 0x69, 0x9c, 0x37, 0xc8, 0xa9, 0x68, 0xc3, 0xae, 0x2d, 0x3b, 0x17, 0x6e, 0xd9, 0xb0, + 0x72, 0xcb, 0xe8, 0xaf, 0xe0, 0x4d, 0x15, 0x6d, 0xda, 0xb9, 0x20, 0xcb, 0xbc, 0x37, 0xe4, + 0x37, 0xfb, 0x56, 0x2e, 0x48, 0xba, 0x68, 0xcb, 0x82, 0xac, 0x3b, 0xb7, 0x8c, 0xff, 0x0c, + 0xeb, 0x36, 0xef, 0x5b, 0xb7, 0x65, 0x8c, 0xe7, 0x1d, 0xea, 0xf5, 0xbe, 0xc2, 0xb7, 0x9b, + 0xee, 0x5e, 0xd5, 0x6d, 0x9a, 0x74, 0x4d, 0x26, 0x59, 0xd3, 0x0d, 0x63, 0xbc, 0xe7, 0x74, + 0x3f, 0x19, 0x63, 0xdd, 0xf6, 0xed, 0x1c, 0xa0, 0xfb, 0x0d, 0xf7, 0xfd, 0x6f, 0x38, 0xd9, + 0x9a, 0xee, 0x9c, 0xfe, 0xa1, 0x3e, 0xef, 0x40, 0x6b, 0x36, 0xe9, 0xeb, 0x7c, 0x83, 0x74, + 0xfb, 0x16, 0xe4, 0x98, 0xf1, 0xd1, 0x7e, 0xd4, 0xcb, 0x7f, 0xa3, 0x41, 0xde, 0x6c, 0xe6, + 0xdb, 0xf5, 0xe2, 0x5f, 0xd9, 0x0a, 0x79, 0xcb, 0x4d, 0x13, 0x54, 0xa7, 0x61, 0x57, 0xf8, + 0x2b, 0x5d, 0xb5, 0xef, 0xb9, 0x6f, 0xcb, 0xda, 0x49, 0xd6, 0x2e, 0x41, 0x82, 0xcc, 0xfa, + 0xb6, 0x2e, 0xc8, 0xb6, 0x61, 0xf3, 0xe8, 0x3f, 0x1c, 0xe2, 0x9d, 0x06, 0xa9, 0x9f, 0x4d, + 0x6b, 0xc7, 0xe8, 0x19, 0xfb, 0x9d, 0xea, 0x63, 0xbb, + ]; + + #[test] + fn test_explode_hamlet_256() { + let mut src_used = HAMLET_256.len(); + let mut dst = VecDeque::new(); + hwexplode( + &HAMLET_256, + HAMLET_256.len(), + 256, + false, + false, + false, + &mut src_used, + &mut dst, + ).unwrap(); + assert_eq!(dst.len(), 256); + } +} diff --git a/src/legacy/lz77.rs b/src/legacy/lz77.rs new file mode 100644 index 000000000..e52cc1068 --- /dev/null +++ b/src/legacy/lz77.rs @@ -0,0 +1,10 @@ +use std::collections::VecDeque; + +/// Output the (dist,len) back reference at dst_pos in dst. 
+pub fn lz77_output_backref(dst: &mut VecDeque, dist: usize, len: usize) { + // assert!(dist <= dst_pos, "cannot reference before beginning of dst"); + + for _ in 0..len { + dst.push_back(dst[dst.len() - dist]); + } +} diff --git a/src/legacy/mod.rs b/src/legacy/mod.rs new file mode 100644 index 000000000..51bcf8ffa --- /dev/null +++ b/src/legacy/mod.rs @@ -0,0 +1,9 @@ +mod bitstream; +mod huffman; +mod lz77; +pub mod shrink; +pub use shrink::*; +pub mod reduce; +pub use reduce::*; +pub mod implode; +pub use implode::*; diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs new file mode 100644 index 000000000..29f1068ae --- /dev/null +++ b/src/legacy/reduce.rs @@ -0,0 +1,486 @@ +use std::collections::VecDeque; +use std::io::{self, copy, Read, Result}; + +use thiserror::Error; + +use crate::legacy::lz77::lz77_output_backref; + +use super::bitstream::{lsb, BitStream}; + +#[derive(Error, Debug)] +enum ReduceError { + #[error("Invalid follower set")] + InvalidFollowerSet, + #[error("Error reading next byte")] + ErrorReadingNextByte, +} + +/// Number of bits used to represent indices in a follower set of size n. +fn follower_idx_bw(n: u8) -> u8 { + assert!(n <= 32); + + if n > 16 { + return 5; + } + if n > 8 { + return 4; + } + if n > 4 { + return 3; + } + if n > 2 { + return 2; + } + if n > 0 { + return 1; + } + return 0; +} + +#[derive(Default, Clone, Copy)] +struct FollowerSet { + size: u8, + idx_bw: u8, + followers: [u8; 32], +} + +/// Read the follower sets from is into fsets. Returns true on success. 
+fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> bool { + for i in (0..=255 as usize).rev() { + let n = lsb(is.bits(), 6) as u8; + if n > 32 { + return false; + } + if !is.advance(6) { + return false; + } + fsets[i].size = n; + fsets[i].idx_bw = follower_idx_bw(n); + + for j in 0..fsets[i].size as usize { + fsets[i].followers[j] = is.bits() as u8; + if !is.advance(8) { + return false; + } + } + } + + return true; +} + +/// Read the next byte from is, decoded based on prev_byte and the follower sets. +/// The byte is returned in *out_byte. The function returns true on success, +/// and false on bad data or end of input. +fn read_next_byte( + is: &mut BitStream, + prev_byte: u8, + fsets: &mut [FollowerSet], + out_byte: &mut u8, +) -> bool { + let bits = is.bits(); + + if fsets[prev_byte as usize].size == 0 { + // No followers; read a literal byte. + *out_byte = bits as u8; + return is.advance(8); + } + + if lsb(bits, 1) == 1 { + // Don't use the follower set; read a literal byte. + *out_byte = (bits >> 1) as u8; + return is.advance(1 + 8); + } + + // The bits represent the index of a follower byte. + let idx_bw = fsets[prev_byte as usize].idx_bw; + let follower_idx = lsb(bits >> 1, idx_bw) as usize; + if follower_idx >= fsets[prev_byte as usize].size as usize { + return false; + } + *out_byte = fsets[prev_byte as usize].followers[follower_idx]; + return is.advance(1 + idx_bw); +} + +fn max_len(comp_factor: u8) -> usize { + let v_len_bits = (8 - comp_factor) as usize; + + assert!(comp_factor >= 1 && comp_factor <= 4); + + // Bits in V + extra len byte + implicit 3. + ((1 << v_len_bits) - 1) + 255 + 3 +} + +fn max_dist(comp_factor: u8) -> usize { + let v_dist_bits = comp_factor as usize; + + assert!(comp_factor >= 1 && comp_factor <= 4); + + // Bits in V * 256 + W byte + implicit 1. 
*/ + ((1 << v_dist_bits) - 1) * 256 + 255 + 1 +} + +const DLE_BYTE: u8 = 144; + +fn hwexpand( + src: &[u8], + src_len: usize, + uncomp_len: usize, + comp_factor: u8, + src_used: &mut usize, + dst: &mut VecDeque, +) -> core::result::Result<(), ReduceError> { + let mut fsets = [FollowerSet::default(); 256]; + assert!(comp_factor >= 1 && comp_factor <= 4); + + let mut is = BitStream::new(src, src_len); + if !read_follower_sets(&mut is, &mut fsets) { + return Err(ReduceError::InvalidFollowerSet); + } + + // Number of bits in V used for backref length. + let v_len_bits = 8 - comp_factor; + + let mut curr_byte = 0; // The first "previous byte" is implicitly zero. + + while dst.len() < uncomp_len { + // Read a literal byte or DLE marker. + if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { + return Err(ReduceError::ErrorReadingNextByte); + } + if curr_byte != DLE_BYTE { + // Output a literal byte. + dst.push_back(curr_byte); + continue; + } + + // Read the V byte which determines the length. + if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { + return Err(ReduceError::ErrorReadingNextByte); + } + if curr_byte == 0 { + // Output a literal DLE byte. + dst.push_back(DLE_BYTE); + continue; + } + let v = curr_byte; + let mut len = lsb(v as u64, v_len_bits) as usize; + if len == (1 << v_len_bits) - 1 { + // Read an extra length byte. + if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { + return Err(ReduceError::ErrorReadingNextByte); + } + len += curr_byte as usize; + } + len += 3; + + // Read the W byte, which together with V gives the distance. + if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { + return Err(ReduceError::ErrorReadingNextByte); + } + let dist = ((v as usize) >> v_len_bits) * 256 + curr_byte as usize + 1; + + assert!(len <= max_len(comp_factor)); + assert!(dist as usize <= max_dist(comp_factor)); + + // Output the back reference. 
+ if len <= uncomp_len - dst.len() && dist as usize <= dst.len() { + // Enough room and no implicit zeros; chunked copy. + lz77_output_backref(dst, dist as usize, len); + } else { + // Copy, handling overlap and implicit zeros. + for _i in 0..len { + if dist as usize > dst.len() { + dst.push_back(0); + continue; + } + dst.push_back(dst[dst.len() - dist as usize]); + } + } + } + + *src_used = is.bytes_read(); + + Ok(()) +} + +#[derive(Debug)] +pub struct ReduceDecoder { + compressed_reader: R, + uncompressed_size: u64, + stream_read: bool, + comp_factor: u8, + stream: VecDeque, +} + +impl ReduceDecoder { + pub fn new(inner: R, uncompressed_size: u64, comp_factor: u8) -> Self { + ReduceDecoder { + compressed_reader: inner, + uncompressed_size, + stream_read: false, + comp_factor, + stream: VecDeque::new(), + } + } + + pub fn finish(mut self) -> Result> { + copy(&mut self.compressed_reader, &mut self.stream)?; + Ok(self.stream) + } +} + +impl Read for ReduceDecoder { + fn read(&mut self, buf: &mut [u8]) -> Result { + if !self.stream_read { + self.stream_read = true; + let mut compressed_bytes = Vec::new(); + if let Err(err) = self.compressed_reader.read_to_end(&mut compressed_bytes) { + return Err(err.into()); + } + let mut src_used = 0; + if let Err(err) = hwexpand( + &compressed_bytes, + compressed_bytes.len(), + self.uncompressed_size as usize, + self.comp_factor, + &mut src_used, + &mut self.stream, + ) { + return Err(io::Error::new(io::ErrorKind::InvalidData, err.to_string())); + } + } + let bytes_read = self.stream.len().min(buf.len()); + buf[..bytes_read].copy_from_slice(&self.stream.drain(..bytes_read).collect::>()); + Ok(bytes_read) + } +} + +#[cfg(test)] +mod tests { + use std::collections::VecDeque; + + use super::hwexpand; + + const HAMLET_2048: [u8; 1285] = [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x58, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x0f, 0x06, 0x11, + 0x31, 0x21, 0x1f, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x99, 0x00, 0x00, 0x00, 0x00, 0x20, 0x80, 0xbc, 0x01, 0xc4, 0x5d, 0x1a, 0x5a, 0x98, + 0x50, 0x06, 0x49, 0xcc, 0xb9, 0xd1, 0x91, 0x11, 0x65, 0x20, 0x68, 0x73, 0x04, 0x08, 0x24, + 0x5d, 0x19, 0x51, 0x06, 0x02, 0x99, 0x06, 0x08, 0x6c, 0x61, 0x84, 0x9c, 0x5b, 0x1d, 0x1d, + 0x02, 0xf9, 0x76, 0x46, 0x36, 0x46, 0x57, 0x96, 0x26, 0x40, 0x86, 0x11, 0x65, 0x61, 0x90, + 0x6c, 0x00, 0x40, 0xb8, 0xd1, 0xcd, 0xd5, 0x09, 0x61, 0x65, 0x02, 0x64, 0x9d, 0xf0, 0x06, + 0x42, 0x40, 0xca, 0xb9, 0x81, 0x10, 0x20, 0x90, 0x69, 0x65, 0x04, 0x24, 0xdd, 0x1b, 0x9a, + 0x50, 0xa6, 0x4e, 0xc8, 0xd1, 0xb9, 0xcd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0x00, 0xe9, 0x22, 0x50, 0x11, 0x11, 0x20, 0x68, 0x52, 0x49, 0x80, 0x40, 0x15, 0x04, 0x00, + 0x80, 0xf0, 0x26, 0x04, 0x08, 0x61, 0x41, 0x02, 0x24, 0x08, 0x00, 0x08, 0x4f, 0x45, 0x00, + 0x20, 0x48, 0x39, 0x09, 0x61, 0x45, 0x02, 0x1a, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0x00, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x02, 0xa4, 0x1b, 0x00, 0x00, 0x80, 0x00, 0xd2, + 0x00, 0x08, 0x20, 0x90, 0x80, 0xa0, 0x22, 0x0e, 0x00, 0x01, 0x24, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x77, 0x61, 0x53, 0x6f, 0x50, 0x45, 0x90, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x28, 0x00, 0x80, 0x00, 0x09, 0x05, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0xbb, 0xbf, 0x0d, 0x28, 0xf7, 0xad, 0x5a, 0xd9, + 0x31, 0xe9, 0x51, 0x1d, 0xc1, 0x62, 0xe8, 0x59, 0x10, 0x2d, 0xf4, 0xf6, 0xed, 0x1a, 0x88, + 0x35, 0x33, 0xd2, 0xb0, 0x6d, 0xd9, 0x90, 0x2e, 0x0b, 0xc5, 0xe6, 0xf1, 0x2a, 
0x2d, 0x9b, + 0xa7, 0x0d, 0xdb, 0x16, 0x84, 0xd0, 0xb8, 0x56, 0x76, 0x2e, 0xdc, 0xb2, 0x61, 0xc0, 0x06, + 0x36, 0x90, 0x4a, 0xd3, 0x88, 0x65, 0xf0, 0x97, 0x34, 0xa2, 0x19, 0x50, 0x3a, 0xea, 0x75, + 0x30, 0xc0, 0x27, 0x8c, 0xf3, 0x14, 0x03, 0x0c, 0xee, 0xa8, 0xe0, 0x69, 0x00, 0xef, 0xa8, + 0xea, 0xe6, 0x42, 0x32, 0x10, 0xdd, 0x30, 0xe1, 0x1c, 0x84, 0xb6, 0x81, 0x6d, 0xdf, 0xce, + 0x51, 0x66, 0x2a, 0xb9, 0x48, 0x67, 0x01, 0x1f, 0x24, 0x20, 0xbd, 0xfb, 0x86, 0x6c, 0xc9, + 0x20, 0x52, 0x37, 0x09, 0x72, 0x0c, 0x30, 0x12, 0x46, 0x03, 0x48, 0x0c, 0x22, 0xd9, 0xe8, + 0x33, 0xca, 0x06, 0xca, 0xe1, 0x1c, 0xcb, 0xf9, 0x98, 0xa6, 0x7d, 0xd3, 0x39, 0x00, 0x91, + 0xbf, 0x2d, 0x6b, 0x87, 0xba, 0x10, 0x64, 0xd6, 0x1b, 0x83, 0x6c, 0x73, 0x1e, 0xc7, 0x18, + 0x6e, 0x1e, 0xd3, 0x94, 0x85, 0x67, 0xd3, 0xda, 0xe1, 0x69, 0x92, 0xbc, 0xf3, 0x3c, 0x0c, + 0x2a, 0x87, 0x2d, 0x90, 0xb0, 0x9a, 0xa6, 0x0d, 0xac, 0x93, 0x19, 0x07, 0x7a, 0xe9, 0xa0, + 0x6d, 0x50, 0x20, 0x24, 0x03, 0x74, 0x30, 0x4d, 0x3b, 0xb6, 0x8c, 0x00, 0x34, 0x6e, 0x98, + 0x6d, 0x9d, 0x8d, 0x04, 0x8f, 0x74, 0x9c, 0xc6, 0x0d, 0x70, 0x22, 0xe1, 0x0d, 0x32, 0x65, + 0x9b, 0x16, 0x12, 0xf4, 0xe9, 0x04, 0x40, 0x97, 0x67, 0xac, 0xd0, 0x72, 0xf9, 0x86, 0x67, + 0x5d, 0x08, 0x32, 0xc9, 0xcc, 0x79, 0x32, 0x88, 0x00, 0xee, 0x26, 0x56, 0xb6, 0x6f, 0xc7, + 0x86, 0x85, 0xb4, 0x08, 0xc8, 0x13, 0x1f, 0x0d, 0x50, 0x03, 0x24, 0x8b, 0xa0, 0x22, 0xb0, + 0x39, 0x48, 0x34, 0xda, 0xe1, 0x74, 0xdf, 0x82, 0x1c, 0xb3, 0xc7, 0xae, 0x41, 0x96, 0x40, + 0xcb, 0xa6, 0x77, 0x21, 0x5b, 0xac, 0x8c, 0x91, 0xd2, 0x72, 0xf3, 0xe0, 0x13, 0x6b, 0x79, + 0x72, 0x03, 0x00, 0x18, 0xe4, 0x02, 0x2e, 0x31, 0x9a, 0x01, 0x9a, 0x66, 0x1a, 0x08, 0x6f, + 0x05, 0x59, 0x56, 0xec, 0xdb, 0xb7, 0x6b, 0x2e, 0x21, 0xad, 0x18, 0xb2, 0x44, 0x72, 0x9a, + 0xb2, 0xa1, 0x8e, 0x29, 0xe4, 0x21, 0x4d, 0x3b, 0xa8, 0x8e, 0xfc, 0x86, 0x3a, 0xb2, 0x41, + 0xbe, 0xd4, 0xb2, 0x6c, 0x18, 0x66, 0x3b, 0x11, 0x42, 0x1d, 0x3a, 0xd1, 0x8e, 0x6d, 0xc5, + 0x90, 0xc6, 0xe4, 0xe4, 0xe0, 0x80, 0xdc, 0x82, 0x3c, 
0x12, 0x34, 0x12, 0x53, 0x23, 0x43, + 0xd3, 0xd5, 0x40, 0x26, 0x4c, 0xad, 0x0a, 0x97, 0x4c, 0x40, 0xae, 0x03, 0x95, 0x85, 0x4b, + 0x17, 0xf2, 0xc0, 0xca, 0x4c, 0x18, 0x16, 0xca, 0xc0, 0xc4, 0xe4, 0x40, 0x2a, 0x52, 0x26, + 0x48, 0x0e, 0x7b, 0xb6, 0xac, 0x0e, 0xda, 0x8d, 0xb2, 0x4d, 0x63, 0xb4, 0x90, 0xda, 0x35, + 0x04, 0x18, 0x76, 0x4c, 0x90, 0xce, 0x39, 0x9d, 0x96, 0x11, 0x99, 0x8c, 0xa0, 0x3a, 0xac, + 0xa2, 0x51, 0x0b, 0x0e, 0xa4, 0xfa, 0xa9, 0x40, 0x10, 0xa2, 0x1a, 0x24, 0x05, 0x3e, 0x19, + 0x81, 0xa4, 0x8a, 0x34, 0x69, 0x0a, 0x04, 0xa5, 0x3e, 0x29, 0x15, 0x1d, 0x12, 0x8f, 0xaa, + 0x58, 0xa4, 0x45, 0x3c, 0x02, 0xd1, 0x42, 0x4f, 0x4f, 0x4b, 0x46, 0x1a, 0xd4, 0xc4, 0xb4, + 0x28, 0x15, 0xaa, 0x40, 0x48, 0x82, 0x87, 0x2c, 0xa2, 0x4b, 0x87, 0x78, 0x74, 0x02, 0x1b, + 0x5e, 0x0e, 0xe1, 0x04, 0x0d, 0x25, 0x8f, 0x44, 0xd3, 0x86, 0xb1, 0x1b, 0xbb, 0x50, 0xd9, + 0x30, 0x42, 0x8a, 0x0f, 0xaa, 0x48, 0x06, 0x49, 0x45, 0x8f, 0x8a, 0x12, 0xcd, 0x82, 0x04, + 0x35, 0xc8, 0x03, 0x4d, 0x2c, 0xa0, 0xd4, 0x24, 0xa7, 0x43, 0x8b, 0x42, 0x02, 0x1f, 0x91, + 0x6e, 0x0a, 0x92, 0xba, 0xc4, 0x8a, 0xa6, 0x06, 0xf8, 0x83, 0x30, 0xc3, 0x83, 0x91, 0xa1, + 0x6f, 0x52, 0x50, 0xad, 0x12, 0x6e, 0x87, 0xc4, 0xa4, 0x06, 0x4e, 0x8d, 0x2d, 0x23, 0x7b, + 0x92, 0x0b, 0x9a, 0xed, 0xdc, 0x34, 0x08, 0xd0, 0x85, 0x41, 0x20, 0x8e, 0xd4, 0x0c, 0x6c, + 0x63, 0x05, 0x31, 0x24, 0x8e, 0x1d, 0x1a, 0x66, 0x66, 0x43, 0x97, 0x90, 0x14, 0x03, 0x99, + 0x41, 0x46, 0xee, 0xdb, 0xb7, 0x6d, 0xa0, 0xf0, 0x9c, 0xb0, 0x0c, 0x6b, 0xf2, 0x42, 0x1e, + 0x98, 0xe1, 0x81, 0x4c, 0x12, 0x24, 0xa5, 0xa4, 0x21, 0x08, 0xbe, 0x65, 0xfb, 0x26, 0x37, + 0x8a, 0xc3, 0x1c, 0xa2, 0x7d, 0x23, 0x14, 0x81, 0xcb, 0x4a, 0x52, 0x49, 0xd0, 0x21, 0x24, + 0xd5, 0xb5, 0x02, 0x3a, 0xdb, 0xd0, 0x2b, 0x39, 0x6c, 0xfb, 0x66, 0xa0, 0x4c, 0x2f, 0xe4, + 0x1a, 0x5e, 0x48, 0x0a, 0x85, 0x4c, 0xc0, 0x0d, 0x39, 0xa1, 0x1b, 0x52, 0x28, 0xec, 0xac, + 0xf0, 0x13, 0x52, 0x06, 0xa4, 0x42, 0x0a, 0xc1, 0x14, 0x24, 0x17, 0x7c, 0x04, 0x81, 0x44, + 0x23, 0x9b, 0x29, 0x07, 0x20, 
0x2c, 0x0f, 0x42, 0x90, 0xd0, 0xee, 0x06, 0x87, 0x96, 0x42, + 0x8a, 0x42, 0x4a, 0x2b, 0x64, 0x63, 0x12, 0x52, 0x14, 0x84, 0x9c, 0x71, 0x0a, 0x29, 0x11, + 0x27, 0x94, 0x68, 0x84, 0x43, 0xd3, 0x00, 0xa3, 0xd4, 0x88, 0x96, 0x71, 0x9b, 0x20, 0x82, + 0x43, 0xb6, 0x58, 0x85, 0xec, 0x02, 0x33, 0xc1, 0x8a, 0x15, 0x42, 0x71, 0x69, 0x85, 0x3c, + 0xfc, 0x42, 0x1e, 0xa9, 0x86, 0xbc, 0xf1, 0x30, 0xe6, 0x75, 0xe5, 0x8e, 0x79, 0xde, 0x30, + 0x24, 0x13, 0x4b, 0x6c, 0x42, 0x0e, 0x3b, 0x96, 0xa8, 0xdc, 0xb0, 0x6d, 0x6a, 0x1a, 0x81, + 0x65, 0x3a, 0xf7, 0x4d, 0x87, 0x4d, 0x21, 0x87, 0xc5, 0x83, 0x6c, 0x13, 0x28, 0x67, 0x20, + 0x8a, 0x6d, 0xe3, 0xc1, 0xfb, 0x50, 0x26, 0xab, 0x9c, 0x54, 0x75, 0x8a, 0x85, 0x4b, 0x0c, + 0x62, 0x87, 0x7c, 0xb0, 0xc1, 0x62, 0xb2, 0xd1, 0x90, 0x45, 0xc4, 0x15, 0xa2, 0xcc, 0x0f, + 0xa4, 0x62, 0x1f, 0x21, 0x31, 0x45, 0x15, 0x72, 0x59, 0xba, 0x6c, 0xc4, 0x98, 0xb5, 0x34, + 0x10, 0x15, 0xba, 0x34, 0x1b, 0x16, 0x72, 0x58, 0x4f, 0x17, 0x79, 0x54, 0x04, 0x5c, 0xa5, + 0x59, 0x2c, 0x66, 0x54, 0xdd, 0xb2, 0x65, 0x84, 0x0a, 0xaf, 0xda, 0x28, 0xf6, 0x98, 0x85, + 0x6e, 0xf2, 0x2e, 0x08, 0xa8, 0x59, 0xc8, 0x72, 0x13, 0x86, 0xb2, 0x69, 0x9d, 0x69, 0x74, + 0x11, 0x9f, 0x98, 0x3e, 0x39, 0x85, 0x74, 0x4e, 0xa6, 0x6f, 0x48, 0x86, 0x43, 0x10, 0x72, + 0xd4, 0x0d, 0xa4, 0xd1, 0xba, 0x48, 0x26, 0x8b, 0x60, 0xd1, 0x29, 0x16, 0xe8, 0x4d, 0x30, + 0x2a, 0x1d, 0x72, 0xcd, 0xa4, 0x8b, 0x7c, 0x82, 0x42, 0x32, 0xd3, 0xa4, 0x20, 0x16, 0x12, + 0xb1, 0xee, 0x59, 0xb4, 0x90, 0xa3, 0x26, 0x20, 0x2f, 0x7c, 0x20, 0x21, 0x25, 0x95, 0x9f, + 0x58, 0x68, 0x24, 0xe7, 0x65, 0x34, 0x0d, 0x7b, 0xc2, 0xb9, 0xbe, 0x2e, 0xd2, 0xe8, 0x49, + 0x0a, 0x3b, 0x29, 0xe5, 0x14, 0xe4, 0x0c, 0x18, 0x27, 0x00, + ]; + + #[test] + fn test_expand_hamlet2048() { + let mut dst = VecDeque::new(); + let mut src_used = 0; + hwexpand( + &HAMLET_2048, + HAMLET_2048.len(), + 2048, + 4, + &mut src_used, + &mut dst, + ).unwrap(); + assert_eq!(dst.len(), 2048); + } + + /* + Put some text first to make PKZIP actually use Reduce 
compression. + Target the code path which copies a zero when dist > current position. + + $ curl -O http://cd.textfiles.com/originalsw/25/pkz092.exe + $ dosbox -c "mount c ." -c "c:" -c "pkz092" -c "exit" + $ dd if=hamlet.txt bs=1 count=2048 > a + $ dd if=/dev/zero bs=1 count=1024 >> a + $ dosbox -c "mount c ." -c "c:" -c "pkzip -ea4 a.zip a" -c "exit" + $ xxd -i -s 31 -l $(expr $(find A.ZIP -printf %s) - 100) A.ZIP + */ + const ZEROS_REDUCED: [u8; 1297] = [ + 0xc2, 0x3f, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x58, 0x07, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x0f, + 0x06, 0x11, 0x31, 0x21, 0x1f, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x04, 0x99, 0x00, 0x00, 0x00, 0x00, 0x20, 0x80, 0xbc, 0x01, 0xc4, 0x5d, 0x1a, + 0x5a, 0x98, 0x50, 0x06, 0x49, 0xcc, 0xb9, 0xd1, 0x91, 0x11, 0x65, 0x20, 0x68, 0x73, 0x04, + 0x08, 0x24, 0x5d, 0x19, 0x51, 0x06, 0x02, 0x99, 0x06, 0x08, 0x6c, 0x61, 0x84, 0x9c, 0x5b, + 0x1d, 0x1d, 0x02, 0xf9, 0x76, 0x46, 0x36, 0x46, 0x57, 0x96, 0x26, 0x40, 0x86, 0x11, 0x65, + 0x61, 0x90, 0x6c, 0x00, 0x40, 0xb8, 0xd1, 0xcd, 0xd5, 0x09, 0x61, 0x65, 0x02, 0x64, 0x9d, + 0xf0, 0x06, 0x42, 0x40, 0xca, 0xb9, 0x81, 0x10, 0x20, 0x90, 0x69, 0x65, 0x04, 0x24, 0xdd, + 0x1b, 0x9a, 0x50, 0xa6, 0x4e, 0xc8, 0xd1, 0xb9, 0xcd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x80, 0x00, 0xe9, 0x22, 0x50, 0x11, 0x11, 0x20, 0x68, 0x52, 0x49, 0x80, 0x40, 0x15, + 0x04, 0x00, 0x80, 0xf0, 0x26, 0x04, 0x08, 0x61, 0x41, 0x02, 0x24, 0x08, 0x00, 0x08, 0x4f, + 0x45, 0x00, 0x20, 0x48, 0x39, 0x09, 0x61, 0x45, 0x02, 0x1a, 
0x15, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x80, 0x00, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x02, 0xa4, 0x1b, 0x00, 0x00, 0x80, + 0x00, 0xd2, 0x00, 0x08, 0x20, 0x90, 0x80, 0xa0, 0x22, 0x0e, 0x00, 0x01, 0x24, 0x00, 0x00, + 0x00, 0x00, 0x20, 0x77, 0x61, 0x53, 0x6f, 0x50, 0x45, 0x90, 0x70, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x28, 0x00, 0x80, 0x00, 0x09, + 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0xbb, 0xbf, 0x0d, 0x28, 0xf7, 0xad, + 0x5a, 0xd9, 0x31, 0xe9, 0x51, 0x1d, 0xc1, 0x62, 0xe8, 0x59, 0x10, 0x2d, 0xf4, 0xf6, 0xed, + 0x1a, 0x88, 0x35, 0x33, 0xd2, 0xb0, 0x6d, 0xd9, 0x90, 0x2e, 0x0b, 0xc5, 0xe6, 0xf1, 0x2a, + 0x2d, 0x9b, 0xa7, 0x0d, 0xdb, 0x16, 0x84, 0xd0, 0xb8, 0x56, 0x76, 0x2e, 0xdc, 0xb2, 0x61, + 0xc0, 0x06, 0x36, 0x90, 0x4a, 0xd3, 0x88, 0x65, 0xf0, 0x97, 0x34, 0xa2, 0x19, 0x50, 0x3a, + 0xea, 0x75, 0x30, 0xc0, 0x27, 0x8c, 0xf3, 0x14, 0x03, 0x0c, 0xee, 0xa8, 0xe0, 0x69, 0x00, + 0xef, 0xa8, 0xea, 0xe6, 0x42, 0x32, 0x10, 0xdd, 0x30, 0xe1, 0x1c, 0x84, 0xb6, 0x81, 0x6d, + 0xdf, 0xce, 0x51, 0x66, 0x2a, 0xb9, 0x48, 0x67, 0x01, 0x1f, 0x24, 0x20, 0xbd, 0xfb, 0x86, + 0x6c, 0xc9, 0x20, 0x52, 0x37, 0x09, 0x72, 0x0c, 0x30, 0x12, 0x46, 0x03, 0x48, 0x0c, 0x22, + 0xd9, 0xe8, 0x33, 0xca, 0x06, 0xca, 0xe1, 0x1c, 0xcb, 0xf9, 0x98, 0xa6, 0x7d, 0xd3, 0x39, + 0x00, 0x91, 0xbf, 0x2d, 0x6b, 0x87, 0xba, 0x10, 0x64, 0xd6, 0x1b, 0x83, 0x6c, 0x73, 0x1e, + 0xc7, 0x18, 0x6e, 0x1e, 0xd3, 0x94, 0x85, 0x67, 0xd3, 0xda, 0xe1, 0x69, 0x92, 0xbc, 0xf3, + 0x3c, 0x0c, 0x2a, 0x87, 0x2d, 0x90, 0xb0, 0x9a, 0xa6, 0x0d, 0xac, 0x93, 0x19, 0x07, 0x7a, + 0xe9, 0xa0, 0x6d, 0x50, 0x20, 0x24, 0x03, 0x74, 0x30, 0x4d, 0x3b, 0xb6, 0x8c, 0x00, 0x34, + 0x6e, 0x98, 0x6d, 0x9d, 0x8d, 0x04, 0x8f, 0x74, 0x9c, 0xc6, 0x0d, 0x70, 0x22, 0xe1, 0x0d, + 0x32, 0x65, 0x9b, 0x16, 0x12, 0xf4, 0xe9, 0x04, 0x40, 0x97, 0x67, 0xac, 0xd0, 0x72, 0xf9, + 0x86, 0x67, 0x5d, 0x08, 0x32, 0xc9, 0xcc, 0x79, 0x32, 0x88, 0x00, 0xee, 0x26, 0x56, 0xb6, + 0x6f, 0xc7, 0x86, 0x85, 0xb4, 0x08, 
0xc8, 0x13, 0x1f, 0x0d, 0x50, 0x03, 0x24, 0x8b, 0xa0, + 0x22, 0xb0, 0x39, 0x48, 0x34, 0xda, 0xe1, 0x74, 0xdf, 0x82, 0x1c, 0xb3, 0xc7, 0xae, 0x41, + 0x96, 0x40, 0xcb, 0xa6, 0x77, 0x21, 0x5b, 0xac, 0x8c, 0x91, 0xd2, 0x72, 0xf3, 0xe0, 0x13, + 0x6b, 0x79, 0x72, 0x03, 0x00, 0x18, 0xe4, 0x02, 0x2e, 0x31, 0x9a, 0x01, 0x9a, 0x66, 0x1a, + 0x08, 0x6f, 0x05, 0x59, 0x56, 0xec, 0xdb, 0xb7, 0x6b, 0x2e, 0x21, 0xad, 0x18, 0xb2, 0x44, + 0x72, 0x9a, 0xb2, 0xa1, 0x8e, 0x29, 0xe4, 0x21, 0x4d, 0x3b, 0xa8, 0x8e, 0xfc, 0x86, 0x3a, + 0xb2, 0x41, 0xbe, 0xd4, 0xb2, 0x6c, 0x18, 0x66, 0x3b, 0x11, 0x42, 0x1d, 0x3a, 0xd1, 0x8e, + 0x6d, 0xc5, 0x90, 0xc6, 0xe4, 0xe4, 0xe0, 0x80, 0xdc, 0x82, 0x3c, 0x12, 0x34, 0x12, 0x53, + 0x23, 0x43, 0xd3, 0xd5, 0x40, 0x26, 0x4c, 0xad, 0x0a, 0x97, 0x4c, 0x40, 0xae, 0x03, 0x95, + 0x85, 0x4b, 0x17, 0xf2, 0xc0, 0xca, 0x4c, 0x18, 0x16, 0xca, 0xc0, 0xc4, 0xe4, 0x40, 0x2a, + 0x52, 0x26, 0x48, 0x0e, 0x7b, 0xb6, 0xac, 0x0e, 0xda, 0x8d, 0xb2, 0x4d, 0x63, 0xb4, 0x90, + 0xda, 0x35, 0x04, 0x18, 0x76, 0x4c, 0x90, 0xce, 0x39, 0x9d, 0x96, 0x11, 0x99, 0x8c, 0xa0, + 0x3a, 0xac, 0xa2, 0x51, 0x0b, 0x0e, 0xa4, 0xfa, 0xa9, 0x40, 0x10, 0xa2, 0x1a, 0x24, 0x05, + 0x3e, 0x19, 0x81, 0xa4, 0x8a, 0x34, 0x69, 0x0a, 0x04, 0xa5, 0x3e, 0x29, 0x15, 0x1d, 0x12, + 0x8f, 0xaa, 0x58, 0xa4, 0x45, 0x3c, 0x02, 0xd1, 0x42, 0x4f, 0x4f, 0x4b, 0x46, 0x1a, 0xd4, + 0xc4, 0xb4, 0x28, 0x15, 0xaa, 0x40, 0x48, 0x82, 0x87, 0x2c, 0xa2, 0x4b, 0x87, 0x78, 0x74, + 0x02, 0x1b, 0x5e, 0x0e, 0xe1, 0x04, 0x0d, 0x25, 0x8f, 0x44, 0xd3, 0x86, 0xb1, 0x1b, 0xbb, + 0x50, 0xd9, 0x30, 0x42, 0x8a, 0x0f, 0xaa, 0x48, 0x06, 0x49, 0x45, 0x8f, 0x8a, 0x12, 0xcd, + 0x82, 0x04, 0x35, 0xc8, 0x03, 0x4d, 0x2c, 0xa0, 0xd4, 0x24, 0xa7, 0x43, 0x8b, 0x42, 0x02, + 0x1f, 0x91, 0x6e, 0x0a, 0x92, 0xba, 0xc4, 0x8a, 0xa6, 0x06, 0xf8, 0x83, 0x30, 0xc3, 0x83, + 0x91, 0xa1, 0x6f, 0x52, 0x50, 0xad, 0x12, 0x6e, 0x87, 0xc4, 0xa4, 0x06, 0x4e, 0x8d, 0x2d, + 0x23, 0x7b, 0x92, 0x0b, 0x9a, 0xed, 0xdc, 0x34, 0x08, 0xd0, 0x85, 0x41, 0x20, 0x8e, 0xd4, + 0x0c, 0x6c, 
0x63, 0x05, 0x31, 0x24, 0x8e, 0x1d, 0x1a, 0x66, 0x66, 0x43, 0x97, 0x90, 0x14, + 0x03, 0x99, 0x41, 0x46, 0xee, 0xdb, 0xb7, 0x6d, 0xa0, 0xf0, 0x9c, 0xb0, 0x0c, 0x6b, 0xf2, + 0x42, 0x1e, 0x98, 0xe1, 0x81, 0x4c, 0x12, 0x24, 0xa5, 0xa4, 0x21, 0x08, 0xbe, 0x65, 0xfb, + 0x26, 0x37, 0x8a, 0xc3, 0x1c, 0xa2, 0x7d, 0x23, 0x14, 0x81, 0xcb, 0x4a, 0x52, 0x49, 0xd0, + 0x21, 0x24, 0xd5, 0xb5, 0x02, 0x3a, 0xdb, 0xd0, 0x2b, 0x39, 0x6c, 0xfb, 0x66, 0xa0, 0x4c, + 0x2f, 0xe4, 0x1a, 0x5e, 0x48, 0x0a, 0x85, 0x4c, 0xc0, 0x0d, 0x39, 0xa1, 0x1b, 0x52, 0x28, + 0xec, 0xac, 0xf0, 0x13, 0x52, 0x06, 0xa4, 0x42, 0x0a, 0xc1, 0x14, 0x24, 0x17, 0x7c, 0x04, + 0x81, 0x44, 0x23, 0x9b, 0x29, 0x07, 0x20, 0x2c, 0x0f, 0x42, 0x90, 0xd0, 0xee, 0x06, 0x87, + 0x96, 0x42, 0x8a, 0x42, 0x4a, 0x2b, 0x64, 0x63, 0x12, 0x52, 0x14, 0x84, 0x9c, 0x71, 0x0a, + 0x29, 0x11, 0x27, 0x94, 0x68, 0x84, 0x43, 0xd3, 0x00, 0xa3, 0xd4, 0x88, 0x96, 0x71, 0x9b, + 0x20, 0x82, 0x43, 0xb6, 0x58, 0x85, 0xec, 0x02, 0x33, 0xc1, 0x8a, 0x15, 0x42, 0x71, 0x69, + 0x85, 0x3c, 0xfc, 0x42, 0x1e, 0xa9, 0x86, 0xbc, 0xf1, 0x30, 0xe6, 0x75, 0xe5, 0x8e, 0x79, + 0xde, 0x30, 0x24, 0x13, 0x4b, 0x6c, 0x42, 0x0e, 0x3b, 0x96, 0xa8, 0xdc, 0xb0, 0x6d, 0x6a, + 0x1a, 0x81, 0x65, 0x3a, 0xf7, 0x4d, 0x87, 0x4d, 0x21, 0x87, 0xc5, 0x83, 0x6c, 0x13, 0x28, + 0x67, 0x20, 0x8a, 0x6d, 0xe3, 0xc1, 0xfb, 0x50, 0x26, 0xab, 0x9c, 0x54, 0x75, 0x8a, 0x85, + 0x4b, 0x0c, 0x62, 0x87, 0x7c, 0xb0, 0xc1, 0x62, 0xb2, 0xd1, 0x90, 0x45, 0xc4, 0x15, 0xa2, + 0xcc, 0x0f, 0xa4, 0x62, 0x1f, 0x21, 0x31, 0x45, 0x15, 0x72, 0x59, 0xba, 0x6c, 0xc4, 0x98, + 0xb5, 0x34, 0x10, 0x15, 0xba, 0x34, 0x1b, 0x16, 0x72, 0x58, 0x4f, 0x17, 0x79, 0x54, 0x04, + 0x5c, 0xa5, 0x59, 0x2c, 0x66, 0x54, 0xdd, 0xb2, 0x65, 0x84, 0x0a, 0xaf, 0xda, 0x28, 0xf6, + 0x98, 0x85, 0x6e, 0xf2, 0x2e, 0x08, 0xa8, 0x59, 0xc8, 0x72, 0x13, 0x86, 0xb2, 0x69, 0x9d, + 0x69, 0x74, 0x11, 0x9f, 0x98, 0x3e, 0x39, 0x85, 0x74, 0x4e, 0xa6, 0x6f, 0x48, 0x86, 0x43, + 0x10, 0x72, 0xd4, 0x0d, 0xa4, 0xd1, 0xba, 0x48, 0x26, 0x8b, 0x60, 0xd1, 0x29, 
0x16, 0xe8, + 0x4d, 0x30, 0x2a, 0x1d, 0x72, 0xcd, 0xa4, 0x8b, 0x7c, 0x82, 0x42, 0x32, 0xd3, 0xa4, 0x20, + 0x16, 0x12, 0xb1, 0xee, 0x59, 0xb4, 0x90, 0xa3, 0x26, 0x20, 0x2f, 0x7c, 0x20, 0x21, 0x25, + 0x95, 0x9f, 0x58, 0x68, 0x24, 0xe7, 0x65, 0x34, 0x0d, 0x7b, 0xc2, 0xb9, 0xbe, 0x2e, 0xd2, + 0xe8, 0x49, 0x0a, 0x3b, 0x29, 0xe5, 0x14, 0xe4, 0x0c, 0x18, 0x27, 0x42, 0xfe, 0x07, 0xff, + 0x83, 0xff, 0xc1, 0xff, 0x77, 0xff, 0x01, + ]; + + #[test] + fn test_expand_zeros() { + let mut dst = VecDeque::new(); + let mut src_used = 0; + hwexpand( + &ZEROS_REDUCED, + ZEROS_REDUCED.len(), + 2048 + 1024, + 4, + &mut src_used, + &mut dst, + ).unwrap(); + assert_eq!(dst.len(), 2048 + 1024); + for i in 0..1024 { + assert_eq!(dst[2048 + i], 0); + } + } +} diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs new file mode 100644 index 000000000..d25cf0a23 --- /dev/null +++ b/src/legacy/shrink.rs @@ -0,0 +1,426 @@ +use std::collections::VecDeque; +use std::io::{self, copy, Read}; + +use thiserror::Error; + +use super::bitstream::{lsb, BitStream}; + +const MIN_CODE_SIZE: u8 = 9; +const MAX_CODE_SIZE: u8 = 13; + +const MAX_CODE: usize = (1 << MAX_CODE_SIZE) - 1; +const INVALID_CODE: u16 = u16::MAX; +const CONTROL_CODE: usize = 256; +const INC_CODE_SIZE: u64 = 1; +const PARTIAL_CLEAR: u64 = 2; + +// const HASH_BITS: usize = MAX_CODE_SIZE + 1; /* For a load factor of 0.5. 
*/
+// const HASHTAB_SIZE: usize = 1 << HASH_BITS;
+/// Value stored in `Codetab::len` while the length of a code's string is not
+/// known yet.
+const UNKNOWN_LEN: u16 = u16::MAX;
+
+/// Reasons for rejecting a Shrink-compressed stream.
+#[derive(Error, Debug)]
+enum ShrinkError {
+    #[error("self-referential code")]
+    InvalidPrefixCode,
+
+    #[error("first code needs to be literal")]
+    FirstCodeNeedsToBeLiteral,
+
+    #[error("invalid code")]
+    InvalidCode,
+
+    #[error("prev code no longer valid")]
+    PrevCodeNoLongerValid,
+}
+
+/// Queue of the codes that are free to be assigned to new table entries.
+///
+/// `codes` holds the free codes in hand-out order, terminated by an
+/// `INVALID_CODE` end-of-queue marker; `next_idx` is the position of the next
+/// code to hand out.
+struct CodeQueue {
+    next_idx: usize,
+    codes: [u16; MAX_CODE - CONTROL_CODE + 1],
+}
+
+impl CodeQueue {
+    /// Creates a queue holding every code above `CONTROL_CODE`, in ascending
+    /// order.
+    fn new() -> Self {
+        // Start from INVALID_CODE everywhere so that the slot following the
+        // last real code is the end-of-queue marker `next` relies on. (With a
+        // zero-filled array an exhausted queue would hand out code 0.)
+        let mut codes = [INVALID_CODE; MAX_CODE - CONTROL_CODE + 1];
+        for (slot, code) in codes
+            .iter_mut()
+            .zip(CONTROL_CODE as u16 + 1..=MAX_CODE as u16)
+        {
+            *slot = code;
+        }
+        Self { next_idx: 0, codes }
+    }
+
+    /// Returns the next code in the queue, or `INVALID_CODE` if the queue is
+    /// empty.
+    fn next(&self) -> u16 {
+        self.codes[self.next_idx]
+    }
+
+    /// Returns and removes the next code from the queue, or returns
+    /// `INVALID_CODE` if the queue is empty.
+    fn remove_next(&mut self) -> u16 {
+        let code = self.next();
+        if code != INVALID_CODE {
+            self.next_idx += 1;
+        }
+        code
+    }
+}
+
+/// One entry of the string table, indexed by code.
+#[derive(Clone, Copy)]
+struct Codetab {
+    /// Code of the string this entry extends; `INVALID_CODE` means the entry
+    /// is invalid.
+    prefix_code: u16,
+    /// Byte appended to the prefix string.
+    ext_byte: u8,
+    /// Length of the entry's string, or `UNKNOWN_LEN`.
+    len: u16,
+    /// Position in the output where the entry's string was last written.
+    last_dst_pos: usize,
+}
+
+impl Default for Codetab {
+    /// Returns an invalid (unassigned) entry.
+    fn default() -> Self {
+        Self {
+            prefix_code: INVALID_CODE,
+            ext_byte: 0,
+            len: 0,
+            last_dst_pos: 0,
+        }
+    }
+}
+
+impl Codetab {
+    /// Builds the initial table: one valid single-byte entry per literal,
+    /// every other entry invalid.
+    pub fn new() -> [Self; MAX_CODE + 1] {
+        let mut codetab = [Codetab::default(); MAX_CODE + 1];
+        // Codes for literal bytes. Set a phony prefix_code so they're valid.
+        for (byte, entry) in (0..=u8::MAX).zip(codetab.iter_mut()) {
+            entry.prefix_code = u16::from(byte);
+            entry.ext_byte = byte;
+            entry.len = 1;
+        }
+
+        codetab
+    }
+}
+
+/// Performs a partial clear: invalidates every code above `CONTROL_CODE` that
+/// is not the prefix of another code and refills `queue` with the freed codes.
+fn unshrink_partial_clear(codetab: &mut [Codetab], queue: &mut CodeQueue) {
+    let mut is_prefix = [false; MAX_CODE + 1];
+
+    // Scan for codes that have been used as a prefix.
+    for entry in &codetab[CONTROL_CODE + 1..=MAX_CODE] {
+        if entry.prefix_code != INVALID_CODE {
+            is_prefix[entry.prefix_code as usize] = true;
+        }
+    }
+
+    // Clear "non-prefix" codes in the table; populate the code queue. The
+    // range is inclusive, like the scan above, so MAX_CODE can be recycled.
+    let mut code_queue_size = 0;
+    for i in CONTROL_CODE + 1..=MAX_CODE {
+        if !is_prefix[i] {
+            codetab[i].prefix_code = INVALID_CODE;
+            queue.codes[code_queue_size] = i as u16;
+            code_queue_size += 1;
+        }
+    }
+    queue.codes[code_queue_size] = INVALID_CODE; // End-of-queue marker.
+    queue.next_idx = 0;
+}
+
+/// Read the next code from the input stream and return it in next_code. Returns
+/// false if the end of the stream is reached. If the stream contains invalid
+/// data, next_code is set to INVALID_CODE but the return value is still true.
+///
+/// Control sequences (code size increase, partial clear) are applied here and
+/// never returned to the caller.
+fn read_code(
+    is: &mut BitStream,
+    code_size: &mut u8,
+    codetab: &mut [Codetab],
+    queue: &mut CodeQueue,
+    next_code: &mut u16,
+) -> bool {
+    // Loop instead of recursing so that a long run of control codes in the
+    // input cannot grow the call stack.
+    loop {
+        let code = lsb(is.bits(), *code_size) as u16;
+        if !is.advance(*code_size) {
+            return false;
+        }
+
+        // Handle regular codes (the common case).
+        if code != CONTROL_CODE as u16 {
+            *next_code = code;
+            return true;
+        }
+
+        // Handle control codes.
+        let control_code = lsb(is.bits(), *code_size);
+        if !is.advance(*code_size) {
+            *next_code = INVALID_CODE;
+            return true;
+        }
+        if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE {
+            *code_size += 1;
+        } else if control_code == PARTIAL_CLEAR {
+            unshrink_partial_clear(codetab, queue);
+        } else {
+            *next_code = INVALID_CODE;
+            return true;
+        }
+    }
+}
+
+/// Output the string represented by a code at the end of dst. On success,
+/// also updates *first_byte and *len with the first byte and length of the
+/// output string, respectively.
+///
+/// # Errors
+///
+/// Returns `ShrinkError::InvalidPrefixCode` if the code, or the prefix it
+/// depends on, is invalid or self-referential.
+fn output_code(
+    code: u16,
+    dst: &mut VecDeque<u8>,
+    prev_code: u16,
+    codetab: &mut [Codetab],
+    queue: &mut CodeQueue,
+    first_byte: &mut u8,
+    len: &mut usize,
+) -> Result<(), ShrinkError> {
+    assert!(code <= MAX_CODE as u16 && code != CONTROL_CODE as u16);
+    if code <= u8::MAX as u16 {
+        // Output literal byte.
+        *first_byte = code as u8;
+        *len = 1;
+        dst.push_back(code as u8);
+        return Ok(());
+    }
+
+    let entry = codetab[code as usize];
+    if entry.prefix_code == INVALID_CODE || entry.prefix_code == code {
+        // Reject invalid codes. Self-referential codes may exist in
+        // the table but cannot be used.
+        return Err(ShrinkError::InvalidPrefixCode);
+    }
+
+    if entry.len != UNKNOWN_LEN {
+        // Output string with known length (the common case). Copying byte by
+        // byte lets the source range overlap the bytes being appended.
+        for i in entry.last_dst_pos..entry.last_dst_pos + entry.len as usize {
+            dst.push_back(dst[i]);
+        }
+        *first_byte = dst[entry.last_dst_pos];
+        *len = entry.len as usize;
+        return Ok(());
+    }
+
+    // Output a string of unknown length. This happens when the prefix
+    // was invalid (due to partial clearing) when the code was inserted into
+    // the table. The prefix can then become valid when it's added to the
+    // table at a later point.
+    let prefix_code = entry.prefix_code;
+    assert!(prefix_code as usize > CONTROL_CODE);
+
+    if prefix_code == queue.next() {
+        // The prefix code hasn't been added yet, but we were just
+        // about to: the KwKwK case. Add the previous string extended
+        // with its first byte.
+        //
+        // Nothing is pushed to dst here: the byte-by-byte copy below reads
+        // the extra byte from the start of its own output. Pushing it as well
+        // would duplicate it and shift the string one byte away from the
+        // position the caller records for this code.
+        assert!(codetab[prev_code as usize].prefix_code != INVALID_CODE);
+        codetab[prefix_code as usize].prefix_code = prev_code;
+        codetab[prefix_code as usize].ext_byte = *first_byte;
+        codetab[prefix_code as usize].len = codetab[prev_code as usize].len + 1;
+        codetab[prefix_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos;
+    } else if codetab[prefix_code as usize].prefix_code == INVALID_CODE {
+        // The prefix code is still invalid.
+        return Err(ShrinkError::InvalidPrefixCode);
+    }
+
+    // Output the prefix string, then the extension byte.
+    let prefix = codetab[prefix_code as usize];
+    let dst_pos = dst.len();
+    for i in prefix.last_dst_pos..prefix.last_dst_pos + prefix.len as usize {
+        dst.push_back(dst[i]);
+    }
+    dst.push_back(entry.ext_byte);
+    *first_byte = dst[prefix.last_dst_pos];
+    *len = prefix.len as usize + 1;
+
+    // Update the code table now that the string has a length and pos.
+    assert!(prev_code != code);
+    codetab[code as usize].len = *len as u16;
+    codetab[code as usize].last_dst_pos = dst_pos;
+
+    Ok(())
+}
+
+/// Decompresses Shrink data from `src`, appending the output to `dst`.
+///
+/// Decoding stops when `dst` holds at least `uncompressed_size` bytes or the
+/// input runs out. On success `src_used` is set to the number of bytes the
+/// bit stream reports as read.
+///
+/// # Errors
+///
+/// Returns a `ShrinkError` if the first code is not a literal or the stream
+/// contains an invalid code.
+fn hwunshrink(
+    src: &[u8],
+    src_len: usize,
+    uncompressed_size: usize,
+    src_used: &mut usize,
+    dst: &mut VecDeque<u8>,
+) -> Result<(), ShrinkError> {
+    let mut codetab = Codetab::new();
+    let mut queue = CodeQueue::new();
+    let mut is = BitStream::new(src, src_len);
+    let mut code_size = MIN_CODE_SIZE;
+
+    // Handle the first code separately since there is no previous code.
+    let mut curr_code = 0;
+    if !read_code(&mut is, &mut code_size, &mut codetab, &mut queue, &mut curr_code) {
+        *src_used = is.bytes_read();
+        return Ok(());
+    }
+    assert!(curr_code != CONTROL_CODE as u16);
+    if curr_code > u8::MAX as u16 {
+        // The first code must be a literal.
+        return Err(ShrinkError::FirstCodeNeedsToBeLiteral);
+    }
+    let mut first_byte = curr_code as u8;
+    codetab[curr_code as usize].last_dst_pos = dst.len();
+    dst.push_back(curr_code as u8);
+
+    let mut prev_code = curr_code;
+    while dst.len() < uncompressed_size
+        && read_code(&mut is, &mut code_size, &mut codetab, &mut queue, &mut curr_code)
+    {
+        if curr_code == INVALID_CODE {
+            return Err(ShrinkError::InvalidCode);
+        }
+        let dst_pos = dst.len();
+        // Handle KwKwK: next code used before being added.
+        if curr_code == queue.next() {
+            if codetab[prev_code as usize].prefix_code == INVALID_CODE {
+                // The previous code is no longer valid.
+                return Err(ShrinkError::PrevCodeNoLongerValid);
+            }
+            // Extend the previous code with its first byte. No byte is pushed
+            // here; output_code's byte-by-byte copy produces it.
+            assert!(curr_code != prev_code);
+            codetab[curr_code as usize].prefix_code = prev_code;
+            codetab[curr_code as usize].ext_byte = first_byte;
+            codetab[curr_code as usize].len = codetab[prev_code as usize].len + 1;
+            codetab[curr_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos;
+        }
+
+        // Output the string represented by the current code.
+        let mut len = 0;
+        output_code(
+            curr_code,
+            dst,
+            prev_code,
+            &mut codetab,
+            &mut queue,
+            &mut first_byte,
+            &mut len,
+        )?;
+
+        // Verify that the recorded lengths match walking the prefixes.
+        let mut c = curr_code;
+        for i in 0..len {
+            assert!(codetab[c as usize].len as usize == len - i);
+            c = codetab[c as usize].prefix_code;
+        }
+
+        // Add a new code to the string table if there's room.
+        // The string is the previous code's string extended with
+        // the first byte of the current code's string.
+        let new_code = queue.remove_next();
+        if new_code != INVALID_CODE {
+            codetab[new_code as usize].prefix_code = prev_code;
+            codetab[new_code as usize].ext_byte = first_byte;
+            codetab[new_code as usize].len = codetab[prev_code as usize].len + 1;
+            codetab[new_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos;
+
+            if codetab[prev_code as usize].prefix_code == INVALID_CODE {
+                // prev_code was invalidated in a partial
+                // clearing. Until that code is re-used, the
+                // string represented by new_code is
+                // indeterminate.
+                codetab[new_code as usize].len = UNKNOWN_LEN;
+            }
+            // If prev_code was invalidated in a partial clearing,
+            // it's possible that new_code==prev_code, in which
+            // case it will never be used or cleared.
+        }
+
+        codetab[curr_code as usize].last_dst_pos = dst_pos;
+        prev_code = curr_code;
+    }
+
+    *src_used = is.bytes_read();
+    Ok(())
+}
+
+/// Reader adapter that decompresses "Shrink"-compressed data.
+///
+/// Nothing is decoded until the first call to `read`. That call pulls the
+/// whole compressed input from the wrapped reader, decompresses it with
+/// `hwunshrink` into an in-memory buffer, and every `read` then serves bytes
+/// from that buffer.
+#[derive(Debug)]
+pub struct ShrinkDecoder<R> {
+    /// Source of the compressed bytes.
+    compressed_reader: R,
+    /// Set once the compressed input has been consumed and decompressed.
+    stream_read: bool,
+    /// Expected size of the decompressed data, passed on to `hwunshrink`.
+    uncompressed_size: u64,
+    /// Decompressed bytes that have not been handed to the caller yet.
+    stream: VecDeque<u8>,
+}
+
+impl<R: Read> ShrinkDecoder<R> {
+    /// Wraps `inner`; no input is read until the first call to `read`.
+    pub fn new(inner: R, uncompressed_size: u64) -> Self {
+        ShrinkDecoder {
+            compressed_reader: inner,
+            uncompressed_size,
+            stream_read: false,
+            stream: VecDeque::new(),
+        }
+    }
+
+    /// Reads the wrapped reader to its end, appending the bytes to the
+    /// internal buffer, and returns that buffer.
+    ///
+    /// NOTE(review): the appended bytes are raw input, not decompressed data;
+    /// this looks intended only for exhausting the underlying reader - confirm
+    /// against the caller.
+    pub fn finish(mut self) -> std::io::Result<VecDeque<u8>> {
+        copy(&mut self.compressed_reader, &mut self.stream)?;
+        Ok(self.stream)
+    }
+}
+
+impl<R: Read> Read for ShrinkDecoder<R> {
+    /// Copies up to `buf.len()` decompressed bytes into `buf`, decompressing
+    /// the whole input first if that has not happened yet.
+    ///
+    /// # Errors
+    ///
+    /// Returns the wrapped reader's error if reading the compressed input
+    /// fails, and an `InvalidData` error if `hwunshrink` rejects the data.
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        if !self.stream_read {
+            // The flag is set up front, so a failed decompression is not
+            // retried on later calls.
+            self.stream_read = true;
+            let mut compressed_bytes = Vec::new();
+            self.compressed_reader.read_to_end(&mut compressed_bytes)?;
+            let mut src_used = compressed_bytes.len();
+            hwunshrink(
+                &compressed_bytes,
+                compressed_bytes.len(),
+                self.uncompressed_size as usize,
+                &mut src_used,
+                &mut self.stream,
+            )
+            .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
+        }
+        let bytes_read = self.stream.len().min(buf.len());
+        // Move the bytes straight out of the queue; no temporary Vec needed.
+        for (out, byte) in buf[..bytes_read]
+            .iter_mut()
+            .zip(self.stream.drain(..bytes_read))
+        {
+            *out = byte;
+        }
+        Ok(bytes_read)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::legacy::shrink::hwunshrink;
+    use std::collections::VecDeque;
+
+    const LZW_FIG5: &[u8; 17] = b"ababcbababaaaaaaa";
+    const LZW_FIG5_SHRUNK: [u8; 12] = [
+        0x61, 0xc4, 0x04, 0x1c, 0x23, 0xb0, 0x60, 0x98, 0x83, 0x08, 0xc3, 0x00,
+    ];
+
+    #[test]
+    fn test_unshrink_lzw_fig5() {
+        let mut dst = VecDeque::new();
+        let mut src_used = 0;
+        hwunshrink(
+            &LZW_FIG5_SHRUNK,
+            LZW_FIG5_SHRUNK.len(),
+            LZW_FIG5.len(),
+            &mut src_used,
+            &mut dst,
+        )
+        .unwrap();
+        assert_eq!(dst, LZW_FIG5);
+    }
+}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs index 35ffcaa68..f693aa4cc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,6 +47,8 @@ mod types; pub mod write; mod zipcrypto; pub use extra_fields::ExtraField; +#[cfg(feature = "legacy-zip")] +mod legacy; #[doc = "Unstable APIs\n\ \ diff --git a/src/read.rs b/src/read.rs index f736c0667..56855dfe7 100644 --- a/src/read.rs +++ b/src/read.rs @@ -6,6 +6,10 @@ use crate::compression::CompressionMethod; use crate::cp437::FromCp437; use crate::crc32::Crc32Reader; use crate::extra_fields::{ExtendedTimestamp, ExtraField}; +#[cfg(feature = "legacy-zip")] +use crate::legacy::ShrinkDecoder; +#[cfg(feature = "legacy-zip")] +use crate::legacy::{ImplodeDecoder, ReduceDecoder}; use crate::read::zip_archive::Shared; use crate::result::{ZipError, ZipResult}; use crate::spec; @@ -142,6 +146,12 @@ pub(crate) enum ZipFileReader<'a> { NoReader, Raw(io::Take<&'a mut dyn Read>), Stored(Crc32Reader>), + #[cfg(feature = "legacy-zip")] + Shrink(Crc32Reader>>), + #[cfg(feature = "legacy-zip")] + Reduce(Crc32Reader>>), + #[cfg(feature = "legacy-zip")] + Implode(Crc32Reader>>), #[cfg(feature = "_deflate-any")] Deflated(Crc32Reader>>), #[cfg(feature = "deflate64")] @@ -160,6 +170,12 @@ impl<'a> Read for ZipFileReader<'a> {
ZipFileReader::NoReader => panic!("ZipFileReader was in an invalid state"), ZipFileReader::Raw(r) => r.read(buf), ZipFileReader::Stored(r) => r.read(buf), + #[cfg(feature = "legacy-zip")] + ZipFileReader::Shrink(r) => r.read(buf), + #[cfg(feature = "legacy-zip")] + ZipFileReader::Reduce(r) => r.read(buf), + #[cfg(feature = "legacy-zip")] + ZipFileReader::Implode(r) => r.read(buf), #[cfg(feature = "_deflate-any")] ZipFileReader::Deflated(r) => r.read(buf), #[cfg(feature = "deflate64")] @@ -181,6 +197,34 @@ impl<'a> ZipFileReader<'a> { ZipFileReader::NoReader => panic!("ZipFileReader was in an invalid state"), ZipFileReader::Raw(r) => r, ZipFileReader::Stored(r) => r.into_inner().into_inner(), + #[cfg(feature = "legacy-zip")] + ZipFileReader::Shrink(r) => { + // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop + // it separately + if let Ok(mut remaining) = r.into_inner().finish() { + let _ = copy(&mut remaining, &mut sink()); + } + return; + } + #[cfg(feature = "legacy-zip")] + ZipFileReader::Reduce(r) => { + // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop + // it separately + if let Ok(mut remaining) = r.into_inner().finish() { + let _ = copy(&mut remaining, &mut sink()); + } + return; + } + #[cfg(feature = "legacy-zip")] + ZipFileReader::Implode(r) => { + // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop + // it separately + if let Ok(mut remaining) = r.into_inner().finish() { + let _ = copy(&mut remaining, &mut sink()); + } + return; + } + #[cfg(feature = "_deflate-any")] ZipFileReader::Deflated(r) => r.into_inner().into_inner().into_inner(), #[cfg(feature = "deflate64")] @@ -196,6 +240,7 @@ impl<'a> ZipFileReader<'a> { if let Ok(mut remaining) = r.into_inner().finish() { let _ = copy(&mut remaining, &mut sink()); } + return; } }; @@ -282,10 +327,13 @@ pub(crate) fn make_crypto_reader<'a>( Ok(reader) } +#[allow(unused_variables)] pub(crate) fn make_reader( 
compression_method: CompressionMethod, crc32: u32, reader: CryptoReader, + uncompressed_size: u64, + flags: u16, ) -> ZipResult { let ae2_encrypted = reader.is_ae2_encrypted(); @@ -295,6 +343,34 @@ pub(crate) fn make_reader( crc32, ae2_encrypted, ))), + #[cfg(feature = "legacy-zip")] + CompressionMethod::Shrink => { + let reader = ShrinkDecoder::new(reader, uncompressed_size); + Ok(ZipFileReader::Shrink(Crc32Reader::new( + reader, + crc32, + ae2_encrypted, + ))) + } + #[cfg(feature = "legacy-zip")] + CompressionMethod::Reduce(comp_factor) => { + let reader = ReduceDecoder::new(reader, uncompressed_size, comp_factor); + Ok(ZipFileReader::Reduce(Crc32Reader::new( + reader, + crc32, + ae2_encrypted, + ))) + } + #[cfg(feature = "legacy-zip")] + CompressionMethod::Implode => { + let reader = ImplodeDecoder::new(reader, uncompressed_size, flags); + Ok(ZipFileReader::Implode(Crc32Reader::new( + reader, + crc32, + ae2_encrypted, + ))) + } + #[cfg(feature = "_deflate-any")] CompressionMethod::Deflated => { let deflate_reader = DeflateDecoder::new(reader); @@ -994,6 +1070,7 @@ fn central_header_to_zip_file_inner( crc32, compressed_size: compressed_size as u64, uncompressed_size: uncompressed_size as u64, + flags, file_name, file_name_raw: file_name_raw.into(), extra_field: Some(Arc::new(extra_field)), @@ -1126,7 +1203,13 @@ impl<'a> ZipFile<'a> { if let ZipFileReader::NoReader = self.reader { let data = &self.data; let crypto_reader = self.crypto_reader.take().expect("Invalid reader state"); - self.reader = make_reader(data.compression_method, data.crc32, crypto_reader)?; + self.reader = make_reader( + data.compression_method, + data.crc32, + crypto_reader, + data.uncompressed_size, + data.flags, + )?; } Ok(&mut self.reader) } @@ -1379,6 +1462,7 @@ pub fn read_zipfile_from_stream<'a, R: Read>(reader: &'a mut R) -> ZipResult(reader: &'a mut R) -> ZipResult, /// Raw file name. To be used when file_name was incorrectly decoded. 
@@ -555,6 +557,7 @@ mod test { crc32: 0, compressed_size: 0, uncompressed_size: 0, + flags: 0, file_name: file_name.clone().into_boxed_str(), file_name_raw: file_name.into_bytes().into_boxed_slice(), extra_field: None, diff --git a/src/write.rs b/src/write.rs index d1686cc06..a80bccf5b 100644 --- a/src/write.rs +++ b/src/write.rs @@ -793,6 +793,7 @@ impl ZipWriter { version_made_by: DEFAULT_VERSION, encrypted: options.encrypt_with.is_some(), using_data_descriptor: false, + flags: 0, compression_method, compression_level: options.compression_level, last_modified_time: options.last_modified_time, @@ -1514,6 +1515,18 @@ impl GenericZipWriter { Ok(Box::new(|bare| Storer(bare))) } } + #[cfg(feature = "legacy-zip")] + CompressionMethod::Shrink => Err(ZipError::UnsupportedArchive( + "Shrink compression unsupported", + )), + #[cfg(feature = "legacy-zip")] + CompressionMethod::Reduce(_) => Err(ZipError::UnsupportedArchive( + "Reduce compression unsupported", + )), + #[cfg(feature = "legacy-zip")] + CompressionMethod::Implode => Err(ZipError::UnsupportedArchive( + "Implode compression unsupported", + )), #[cfg(feature = "_deflate-any")] CompressionMethod::Deflated => { let default = if cfg!(feature = "deflate-zopfli") { diff --git a/tests/data/folder/first.txt b/tests/data/folder/first.txt new file mode 100644 index 000000000..9345759e4 --- /dev/null +++ b/tests/data/folder/first.txt @@ -0,0 +1 @@ +The play of Hamlet is above all others the most stupendous monument of Shakespeare's genius, standing as a beacon to command the wonder and admiration of the world, and as a memorial to future generations, that the mind of its author was moved by little less than inspiration. 
Lear, with its sublime picture of human misery;—Othello, with its harrowing overthrow of a nature great and amiable;—Macbeth, with its fearful murder of a monarch, whose "virtues plead like angels trumpet-tongued against the deep damnation of his taking off,"—severally exhibit, in the most pre-eminent degree, all those mighty elements which constitute the perfection of tragic art—the grand, the pitiful, and the terrible. Hamlet is a history of mind—a tragedy of thought. It contains the deepest philosophy, and most profound wisdom; yet speaks the language of the heart, touching the secret spring of every sense and feeling. Here we have no ideal exaltation of character, but life with its blended faults ands,—a gentle nature unstrung by passing events, and thus rendered "out of tune and harsh." diff --git a/tests/data/implode.zip b/tests/data/implode.zip new file mode 100644 index 0000000000000000000000000000000000000000..9ce50ed2a6d47d0ea4955e225c68e6489b45467a GIT binary patch literal 800 zcmWIWW@h1HU;{#yjcX$Es-`NfVPar#VPRn4WME)O%PcA`(JQGav9XWi{O8BS$7aWA z$I8KGSI@DZgHxW9gP%iRVn4Ge3zH}dGe4UMGYc!fGK&}oYdqV3efIxAaGw4Dem-7q z1|en^W)W8Hs^7n7b8#^Uu`n^SvQ_{2^QSTF`^F-7d;gjVyEWGTeYHQh^2T4i)90g0 z7uTB>HIU@N#~=sI)9G5U(r_{yJq9aX`}LVy0a;QK5wyk6_q)2y|7^w;11HE(}f|HJ1U{3qT{{CDB1)`54C zt74AU=BS>u_OF>LT3xsCS8qnZL8m76d#kih%jmB8CQ+%e?|GtmlEZwT-kVxa47q=F-@T(E;-RbSZ$C^cQ<~VvBPO&b{zh-?w2S*4 zR!xo4{hOU7aP$~!?a^6WG4VBbmR^0qmMpFvxAjGQrOzGvEm0?5P5${Nck7zD$;b0l z^g8)6-~90UC#M_XQ84du@_8%W(A8^Q!ioYEwo4~3J=HzIIpAWnox?>g86A}$X>V6A zUK29!=xVW#jJYEB9IiN*EQ?!f^uBUUmzVe!OaCV~*VvbQwbF0;^J>_WkU zobT@*iKnqDbEp4hfBN`U{~6=#1<_`E*JSm_xw^2)1b8ztaWUXd)WCEN1PWjhDTxPo Tv$BE27=h3nNM8o#0|o{F>=A7& literal 0 HcmV?d00001 diff --git a/tests/data/reduce.zip b/tests/data/reduce.zip new file mode 100644 index 0000000000000000000000000000000000000000..c413c87aef8083f5e872e6a71b883c01dff026f9 GIT binary patch literal 1058 zcmWIWW@h1HU|?Wn_`YFHL|)ZYrFF~<3@$7T44gpGw9KO761|d&5+EBBI52Qv3KB0O 
zz{tcbh@_ll3W%f7;K1Z0l@j2#5!t%O|vP&pUnk=COnyNdYVr6lZKW6 z>Fp;kw*`6#%+%CkKP2Tc%Tk#^Y~3 z4S09k`r!X-e1=ngfAf3L61hF`T(qSG_vYIMYa|pTOj{ETWn~VY;Au{qv}>JAF4M{d zW{DCq%3c@t%@r2eI6LgzmNznoIKA0kNiisWu=Lnj%GaL1(D}q^kqe2JA52hpIT0hK z=dmgA_O6YtGE$t9Wxi);nVt@Ek$HB7Pq0!>j=%TlvXh1%JegxGb2M+W6>y~3DNjq? z=)pUeV>9phDS2||7AX2YbGVqnVyQMmXVRp;2NTj{lDVraUz)Hp78EwL9G6r*p%i@S z$k~(=%6tOCd*b#EuK5ihkGW&-`@Ky4hA>3=axG>(=FzmE>4l7N8$0L;N#kN^Mx literal 0 HcmV?d00001 diff --git a/tests/data/shrink.zip b/tests/data/shrink.zip new file mode 100644 index 0000000000000000000000000000000000000000..efaf97077e89bbbc762199dedbdb1d54d4591200 GIT binary patch literal 825 zcmWIWW@h1HU|?Wm;NGw%BCl$y(orS`1{W3v22P-;n`cmPh+arUNXUgLjEwERGi(&@ z&0uU)@sT!5ebL}y%|6h1>jQt>qM-wtXr z-0aHL+yXuuPDNI^o6PE5JoB0$^KG?*CdqQ{=Q@`yyL?FG=ChlMd;&|=Rjl_Z=E+pL z&MSJdM^nypp_`?`)%z#CZ}`TZU_Q_DROd!w(zDbZ)%laYYI%H8INNNXyl3NavkBL? zosBGAcEK>CWA;4bfR?#C`|E{TB$nyq%+KrJB&oSv&D%qdORXL_71lnh1s}By3q&88M1$B0dZm!u08Ogz3Ub!N(%8C(aygcTO=xpIt8;YpxY zK*ZX@+YW4JEnYiVzEzzNP?EtW+~!$z@}Y$DQWdp5mQz2RvRE?l%0aiHlvK9W zDduO@RO%G2G~Qff>ity8aphUY{C(!B&%z=Vj Date: Mon, 13 May 2024 05:18:46 +0200 Subject: [PATCH 02/50] Update src/legacy/bitstream.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/bitstream.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index 3fdb55ee9..0cd5a4ad3 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -1,7 +1,7 @@ /// Get the n least significant bits of x. pub fn lsb(x: u64, n: u8) -> u64 { assert!(n <= 63); - x & ((1u64.wrapping_shl(n as u32)) - 1u64) + x & (1u64 << (n as u32) - 1) } /// Reverse the n least significant bits of x. 
From 77e247d28ea01289a2250cbf28aff9db72237ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 05:19:09 +0200 Subject: [PATCH 03/50] Update src/legacy/reduce.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/reduce.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 29f1068ae..e25189cdd 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -479,8 +479,8 @@ mod tests { &mut dst, ).unwrap(); assert_eq!(dst.len(), 2048 + 1024); - for i in 0..1024 { - assert_eq!(dst[2048 + i], 0); + for i in 0..(1 << 10) { + assert_eq!(dst[(1 << 11) + i], 0); } } } From ad8efb66f52e96e98fa2f2aeb9aa69a39cdfb326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 05:21:11 +0200 Subject: [PATCH 04/50] Formatted some comments. --- src/legacy/huffman.rs | 40 ++++++++++++++++++++-------------------- src/legacy/implode.rs | 13 ++++++++----- src/legacy/reduce.rs | 12 +++++++----- src/legacy/shrink.rs | 35 ++++++++++++++++++++--------------- 4 files changed, 55 insertions(+), 45 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index c61988469..c6737bf31 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -107,7 +107,7 @@ impl HuffmanDecoder { let codeword = reverse16(codeword, len); // Make it LSB-first. let pad_len = HUFFMAN_LOOKUP_TABLE_BITS as usize - len; - // Pad the pad_len upper bits with all bit combinations. + // Pad the pad_len upper bits with all bit combinations. 
for padding in 0..(1 << pad_len) { let index = (codeword | (padding << len)) as usize; assert!(sym <= u16::MAX as usize); @@ -158,25 +158,25 @@ mod tests { #[test] fn test_huffman_decode_basic() { let lens = [ - 3, // sym 0: 000 - 3, // sym 1: 001 - 3, // sym 2: 010 - 3, // sym 3: 011 - 3, // sym 4: 100 - 3, // sym 5: 101 - 4, // sym 6: 1100 - 4, // sym 7: 1101 - 0, // sym 8: - 0, // sym 9: - 0, // sym 10: - 0, // sym 11: - 0, // sym 12: - 0, // sym 13: - 0, // sym 14: - 0, // sym 15: - 6, // sym 16: 111110 - 5, // sym 17: 11110 - 4, // sym 18: 1110 + 3, // sym 0: 000 + 3, // sym 1: 001 + 3, // sym 2: 010 + 3, // sym 3: 011 + 3, // sym 4: 100 + 3, // sym 5: 101 + 4, // sym 6: 1100 + 4, // sym 7: 1101 + 0, // sym 8: + 0, // sym 9: + 0, // sym 10: + 0, // sym 11: + 0, // sym 12: + 0, // sym 13: + 0, // sym 14: + 0, // sym 15: + 6, // sym 16: 111110 + 5, // sym 17: 11110 + 4, // sym 18: 1110 ]; let mut d = HuffmanDecoder::default(); diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 5ee3e252e..090a5627d 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -17,7 +17,7 @@ enum ImplodeError { #[error("Too many codeword lengths")] TooManyCodewordLengths, - + #[error("Too few codeword lengths")] TooFewCodewordLengths, @@ -30,7 +30,11 @@ enum ImplodeError { /// Initialize the Huffman decoder d with num_lens codeword lengths read from is. /// Returns false if the input is invalid. 
-fn read_huffman_code(is: &mut BitStream, num_lens: usize, d: &mut HuffmanDecoder) -> core::result::Result<(), ImplodeError> { +fn read_huffman_code( + is: &mut BitStream, + num_lens: usize, + d: &mut HuffmanDecoder, +) -> core::result::Result<(), ImplodeError> { let mut lens = [0; 256]; let mut len_count = [0; 17]; // assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); @@ -40,7 +44,6 @@ fn read_huffman_code(is: &mut BitStream, num_lens: usize, d: &mut HuffmanDecoder let num_bytes = (byte + 1) as usize; if !is.advance(8) { return Err(ImplodeError::EndOfStream); - } let mut codeword_idx = 0; @@ -92,7 +95,6 @@ fn read_huffman_code(is: &mut BitStream, num_lens: usize, d: &mut HuffmanDecoder Ok(()) } - fn hwexplode( src: &[u8], src_len: usize, @@ -311,7 +313,8 @@ mod tests { false, &mut src_used, &mut dst, - ).unwrap(); + ) + .unwrap(); assert_eq!(dst.len(), 256); } } diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index e25189cdd..ba65fe91b 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -128,7 +128,7 @@ fn hwexpand( comp_factor: u8, src_used: &mut usize, dst: &mut VecDeque, -) -> core::result::Result<(), ReduceError> { +) -> core::result::Result<(), ReduceError> { let mut fsets = [FollowerSet::default(); 256]; assert!(comp_factor >= 1 && comp_factor <= 4); @@ -137,10 +137,10 @@ fn hwexpand( return Err(ReduceError::InvalidFollowerSet); } - // Number of bits in V used for backref length. + // Number of bits in V used for backref length. let v_len_bits = 8 - comp_factor; - let mut curr_byte = 0; // The first "previous byte" is implicitly zero. + let mut curr_byte = 0; // The first "previous byte" is implicitly zero. while dst.len() < uncomp_len { // Read a literal byte or DLE marker. 
@@ -361,7 +361,8 @@ mod tests { 4, &mut src_used, &mut dst, - ).unwrap(); + ) + .unwrap(); assert_eq!(dst.len(), 2048); } @@ -477,7 +478,8 @@ mod tests { 4, &mut src_used, &mut dst, - ).unwrap(); + ) + .unwrap(); assert_eq!(dst.len(), 2048 + 1024); for i in 0..(1 << 10) { assert_eq!(dst[(1 << 11) + i], 0); diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index d25cf0a23..a70d1e4e3 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -201,7 +201,7 @@ fn output_code( // Output a string of unknown length. This happens when the prefix // was invalid (due to partial clearing) when the code was inserted into // the table. The prefix can then become valid when it's added to the - // table at a later point. + // table at a later point. assert!(codetab[code as usize].len == UNKNOWN_LEN); let prefix_code = codetab[code as usize].prefix_code; assert!(prefix_code as usize > CONTROL_CODE); @@ -217,7 +217,7 @@ fn output_code( codetab[prefix_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; dst.push_back(*first_byte); } else if codetab[prefix_code as usize].prefix_code == INVALID_CODE { - // The prefix code is still invalid. + // The prefix code is still invalid. 
return Err(ShrinkError::InvalidPrefixCode); } @@ -272,13 +272,15 @@ fn hwunshrink( dst.push_back(curr_code as u8); let mut prev_code = curr_code; - while dst.len() < uncompressed_size && read_code( - &mut is, - &mut code_size, - &mut codetab, - &mut queue, - &mut curr_code, - ) { + while dst.len() < uncompressed_size + && read_code( + &mut is, + &mut code_size, + &mut codetab, + &mut queue, + &mut curr_code, + ) + { println!("{}", dst.len()); if curr_code == INVALID_CODE { return Err(ShrinkError::InvalidCode); @@ -297,7 +299,7 @@ fn hwunshrink( codetab[curr_code as usize].len = codetab[prev_code as usize].len + 1; codetab[curr_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; // assert!(dst_pos < dst_cap); - // dst.push_back(first_byte); + // dst.push_back(first_byte); } // Output the string represented by the current code. @@ -404,11 +406,13 @@ impl Read for ShrinkDecoder { #[cfg(test)] mod tests { - use std::collections::VecDeque; use crate::legacy::shrink::hwunshrink; + use std::collections::VecDeque; const LZW_FIG5: &[u8; 17] = b"ababcbababaaaaaaa"; - const LZW_FIG5_SHRUNK: [u8; 12] = [ 0x61, 0xc4, 0x04, 0x1c, 0x23, 0xb0, 0x60, 0x98, 0x83, 0x08, 0xc3, 0x00 ]; + const LZW_FIG5_SHRUNK: [u8; 12] = [ + 0x61, 0xc4, 0x04, 0x1c, 0x23, 0xb0, 0x60, 0x98, 0x83, 0x08, 0xc3, 0x00, + ]; #[test] fn test_unshrink_lzw_fig5() { @@ -419,8 +423,9 @@ mod tests { LZW_FIG5_SHRUNK.len(), LZW_FIG5.len(), &mut src_used, - &mut dst - ).unwrap(); + &mut dst, + ) + .unwrap(); assert_eq!(dst, LZW_FIG5); } -} \ No newline at end of file +} From 334b1101486b90b9642492bbe4ec14c00af2d19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 06:21:00 +0200 Subject: [PATCH 05/50] Added some code review changes. 
--- Cargo.toml | 1 + src/legacy/bitstream.rs | 49 +++++++------- src/legacy/huffman.rs | 19 +++--- src/legacy/implode.rs | 98 ++++++++++++---------------- src/legacy/lz77.rs | 2 +- src/legacy/reduce.rs | 137 ++++++++++++++++++++-------------------- src/legacy/shrink.rs | 86 +++++++++++++------------ 7 files changed, 191 insertions(+), 201 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7e31a0b03..9b8869909 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,6 +84,7 @@ default = [ "lzma", "time", "zstd", + "legacy-zip", ] [[bench]] diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index 0cd5a4ad3..b80a9715d 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -1,20 +1,22 @@ +use std::io; + /// Get the n least significant bits of x. pub fn lsb(x: u64, n: u8) -> u64 { - assert!(n <= 63); - x & (1u64 << (n as u32) - 1) + debug_assert!(n <= 63); + x & ((1u64 << (n as u32)) - 1) } /// Reverse the n least significant bits of x. /// The (16 - n) most significant bits of the result will be zero. pub fn reverse16(x: u16, n: usize) -> u16 { - assert!(n > 0); - assert!(n <= 16); + debug_assert!(n > 0); + debug_assert!(n <= 16); return x.reverse_bits() >> (16 - n); } /* pub fn round_up(x: usize, m: usize) -> usize { - assert!((m & (m - 1)) == 0, "m must be a power of two"); + debug_assert!((m & (m - 1)) == 0, "m must be a power of two"); (x + m - 1) & (-(m as i64)) as usize // Hacker's Delight (2nd), 3-1. } */ @@ -41,7 +43,7 @@ impl<'a> BitStream<'a> { /// fewer if the end of stream is reached. The upper bits are zero-padded. pub fn bits(&mut self) -> u64 { let next = self.bitpos / 8; - assert!(next < self.src.len(), "Cannot read past end of stream."); + debug_assert!(next < self.src.len(), "Cannot read past end of stream."); let bits = if next + 8 <= self.src.len() { // Common case: read 8 bytes in one go. @@ -60,23 +62,26 @@ impl<'a> BitStream<'a> { /// Advance n bits in the bitstream if possible. 
Returns false if that many bits /// are not available in the stream. - pub fn advance(&mut self, n: u8) -> bool { - assert!(self.bitpos <= self.bitpos_end); + pub fn advance(&mut self, n: u8) -> std::io::Result<()> { + debug_assert!(self.bitpos <= self.bitpos_end); if self.bitpos_end - self.bitpos < n as usize { - return false; + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "End of stream", + )); } self.bitpos += n as usize; - return true; + Ok(()) } /// Align the input stream to the next 8-bit boundary and return a pointer to /// that byte, which may be the past-the-end-of-stream byte. pub fn _byte_align(&mut self) -> usize { - assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); + debug_assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); self.bitpos = 8 * (self.bitpos / 8); - assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); + debug_assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); return self.bitpos / 8; } @@ -126,21 +131,21 @@ mod tests { let mut is = super::BitStream::new(&bits, 1); assert_eq!(lsb(is.bits(), 1), 1); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 1); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 1); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 0); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 0); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 0); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 1); - assert!(is.advance(1)); + is.advance(1).unwrap(); assert_eq!(lsb(is.bits(), 1), 0); - assert!(is.advance(1)); + is.advance(1).unwrap(); } #[test] @@ -148,9 +153,9 @@ mod tests { let bits = [0x45, 048]; let mut is = super::BitStream::new(&bits, 9); assert_eq!(lsb(is.bits(), 3), 0x05); - assert!(is.advance(3)); + is.advance(3).unwrap(); 
assert_eq!(lsb(is.bits(), 4), 0x08); - assert!(is.advance(4)); + is.advance(4).unwrap(); } } diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index c6737bf31..fe0bd69ab 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -57,12 +57,11 @@ impl HuffmanDecoder { // Count the number of codewords of each length. for i in 0..n { - assert!(lengths[i] as usize <= MAX_HUFFMAN_BITS); + debug_assert!(lengths[i] as usize <= MAX_HUFFMAN_BITS); count[lengths[i] as usize] += 1; } count[0] = 0; // Ignore zero-length codewords. - - // Compute sentinel_bits and offset_first_sym_idx for each length. + // Compute sentinel_bits and offset_first_sym_idx for each length. code[0] = 0; sym_idx[0] = 0; for l in 1..=MAX_HUFFMAN_BITS { @@ -76,7 +75,7 @@ impl HuffmanDecoder { let s = ((code[l] as u32 + count[l] as u32) << (MAX_HUFFMAN_BITS - l)) as u32; self.sentinel_bits[l] = s; - assert!(self.sentinel_bits[l] >= code[l] as u32, "No overflow!"); + debug_assert!(self.sentinel_bits[l] >= code[l] as u32, "No overflow!"); sym_idx[l] = sym_idx[l - 1] + count[l - 1]; self.offset_first_sym_idx[l] = sym_idx[l].wrapping_sub(code[l]); @@ -102,7 +101,7 @@ impl HuffmanDecoder { } pub fn table_insert(&mut self, sym: usize, len: usize, codeword: u16) { - assert!(len <= HUFFMAN_LOOKUP_TABLE_BITS as usize); + debug_assert!(len <= HUFFMAN_LOOKUP_TABLE_BITS as usize); let codeword = reverse16(codeword, len); // Make it LSB-first. let pad_len = HUFFMAN_LOOKUP_TABLE_BITS as usize - len; @@ -110,9 +109,9 @@ impl HuffmanDecoder { // Pad the pad_len upper bits with all bit combinations. 
for padding in 0..(1 << pad_len) { let index = (codeword | (padding << len)) as usize; - assert!(sym <= u16::MAX as usize); + debug_assert!(sym <= u16::MAX as usize); self.table[index].sym = sym as u16; - assert!(len <= u8::MAX as usize); + debug_assert!(len <= u8::MAX as usize); self.table[index].len = len as u8; } } @@ -124,11 +123,11 @@ impl HuffmanDecoder { pub fn huffman_decode(&mut self, bits: u16, num_used_bits: &mut u8) -> i32 { // First try the lookup table. let lookup_bits = lsb(bits as u64, HUFFMAN_LOOKUP_TABLE_BITS) as usize; - assert!(lookup_bits < self.table.len()); + debug_assert!(lookup_bits < self.table.len()); if self.table[lookup_bits].len != 0 { - assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); - // assert!(self.table[lookup_bits].sym < self.num_syms); + debug_assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); + // debug_assert!(self.table[lookup_bits].sym < self.num_syms); *num_used_bits = self.table[lookup_bits].len; return self.table[lookup_bits].sym as i32; } diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 090a5627d..7e36aab49 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -1,7 +1,5 @@ use std::collections::VecDeque; -use std::io::{self, copy, Read, Result}; - -use thiserror::Error; +use std::io::{self, copy, Error, Read, Result}; use crate::legacy::bitstream::{lsb, ISTREAM_MIN_BITS}; use crate::legacy::lz77::lz77_output_backref; @@ -10,88 +8,78 @@ use super::bitstream::BitStream; use super::huffman::HuffmanDecoder; //const COMPRESSED_BYTES_TO_BUFFER: usize = 4096; -#[derive(Error, Debug)] -enum ImplodeError { - #[error("End of stream")] - EndOfStream, - - #[error("Too many codeword lengths")] - TooManyCodewordLengths, - - #[error("Too few codeword lengths")] - TooFewCodewordLengths, - - #[error("Higher count than available codewords")] - HigherCountThanAvailableCodewords, - - #[error("Not all codewords used")] - NotAllCodewordsUsed, -} - /// Initialize the Huffman 
decoder d with num_lens codeword lengths read from is. /// Returns false if the input is invalid. fn read_huffman_code( is: &mut BitStream, num_lens: usize, d: &mut HuffmanDecoder, -) -> core::result::Result<(), ImplodeError> { +) -> std::io::Result<()> { let mut lens = [0; 256]; let mut len_count = [0; 17]; - // assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); + // debug_assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); // Number of bytes representing the Huffman code. let byte = lsb(is.bits(), 8); let num_bytes = (byte + 1) as usize; - if !is.advance(8) { - return Err(ImplodeError::EndOfStream); - } + is.advance(8)?; let mut codeword_idx = 0; for _byte_idx in 0..num_bytes { let byte = lsb(is.bits(), 8); - if !is.advance(8) { - return Err(ImplodeError::EndOfStream); - } + is.advance(8)?; let codeword_len = (byte & 0xf) + 1; /* Low four bits plus one. */ let run_length = (byte >> 4) + 1; /* High four bits plus one. */ - assert!(codeword_len >= 1 && codeword_len <= 16); - //assert!(codeword_len < sizeof(len_count) / sizeof(len_count[0])); + debug_assert!(codeword_len >= 1 && codeword_len <= 16); + //debug_assert!(codeword_len < sizeof(len_count) / sizeof(len_count[0])); len_count[codeword_len as usize] += run_length; if (codeword_idx + run_length) as usize > num_lens { - return Err(ImplodeError::TooManyCodewordLengths); + return Err(Error::new( + io::ErrorKind::InvalidData, + "Too many codeword lengths", + )); } for _ in 0..run_length { - assert!((codeword_idx as usize) < num_lens); + debug_assert!((codeword_idx as usize) < num_lens); lens[codeword_idx as usize] = codeword_len as u8; codeword_idx += 1; } } - assert!(codeword_idx as usize <= num_lens); + debug_assert!(codeword_idx as usize <= num_lens); if (codeword_idx as usize) < num_lens { - return Err(ImplodeError::TooFewCodewordLengths); + return Err(Error::new( + io::ErrorKind::InvalidData, + "Not enough codeword lengths", + )); } // Check that the Huffman tree is full. 
let mut avail_codewords = 1; for i in 1..=16 { - assert!(avail_codewords >= 0); + debug_assert!(avail_codewords >= 0); avail_codewords *= 2; avail_codewords -= len_count[i] as i32; if avail_codewords < 0 { - return Err(ImplodeError::HigherCountThanAvailableCodewords); + return Err(Error::new( + io::ErrorKind::InvalidData, + "Huffman tree is not full", + )); } } if avail_codewords != 0 { // Not all codewords were used. - return Err(ImplodeError::NotAllCodewordsUsed); + return Err(Error::new( + io::ErrorKind::InvalidData, + "Not all codewords were used", + )); } let ok = d.init(&lens, num_lens); - assert!(ok, "The checks above mean the tree should be valid."); + debug_assert!(ok, "The checks above mean the tree should be valid."); Ok(()) } @@ -104,7 +92,7 @@ fn hwexplode( pk101_bug_compat: bool, src_used: &mut usize, dst: &mut VecDeque, -) -> core::result::Result<(), ImplodeError> { +) -> std::io::Result<()> { let mut is = BitStream::new(src, src_len); let mut lit_decoder = HuffmanDecoder::default(); let mut len_decoder = HuffmanDecoder::default(); @@ -137,22 +125,18 @@ fn hwexplode( let mut used = 0; if lit_tree { sym = lit_decoder.huffman_decode(!bits as u16, &mut used); - assert!(sym >= 0, "huffman decode failed"); - if !is.advance(1 + used) { - return Err(ImplodeError::EndOfStream); - } + debug_assert!(sym >= 0, "huffman decode failed"); + is.advance(1 + used)?; } else { sym = lsb(bits, 8) as i32; - if !is.advance(1 + 8) { - return Err(ImplodeError::EndOfStream); - } + is.advance(1 + 8)?; } - assert!(sym >= 0 && sym <= u8::MAX as i32); + debug_assert!(sym >= 0 && sym <= u8::MAX as i32); dst.push_back(sym as u8); continue; } // Backref. - assert!(lsb(bits, 1) == 0x0); + debug_assert!(lsb(bits, 1) == 0x0); let mut used_tot = 1; bits >>= 1; @@ -171,7 +155,7 @@ fn hwexplode( // Read the Huffman-encoded high dist bits. 
let mut used = 0; let sym = dist_decoder.huffman_decode(!bits as u16, &mut used); - assert!(sym >= 0, "huffman decode failed"); + debug_assert!(sym >= 0, "huffman decode failed"); used_tot += used; bits >>= used; dist |= (sym as usize) << if large_wnd { 7 } else { 6 }; @@ -179,7 +163,7 @@ fn hwexplode( // Read the Huffman-encoded len. let sym = len_decoder.huffman_decode(!bits as u16, &mut used); - assert!(sym >= 0, "huffman decode failed"); + debug_assert!(sym >= 0, "huffman decode failed"); used_tot += used; bits >>= used; let mut len = (sym + min_len) as usize; @@ -191,10 +175,8 @@ fn hwexplode( // bits >>= 8; } - assert!((used_tot as usize) <= ISTREAM_MIN_BITS); - if !is.advance(used_tot) { - return Err(ImplodeError::EndOfStream); - } + debug_assert!((used_tot as usize) <= ISTREAM_MIN_BITS); + is.advance(used_tot)?; // let len = len.min(uncomp_len - dst.len()); if len <= uncomp_len - dst.len() && dist <= dst.len() { @@ -255,7 +237,7 @@ impl Read for ImplodeDecoder { return Err(err.into()); } let mut src_used = 0; - if let Err(err) = hwexplode( + hwexplode( &compressed_bytes, compressed_bytes.len(), self.uncompressed_size as usize, @@ -264,9 +246,7 @@ impl Read for ImplodeDecoder { false, &mut src_used, &mut self.stream, - ) { - return Err(io::Error::new(io::ErrorKind::InvalidData, err.to_string())); - } + )?; } let bytes_read = self.stream.len().min(buf.len()); buf[..bytes_read].copy_from_slice(&self.stream.drain(..bytes_read).collect::>()); diff --git a/src/legacy/lz77.rs b/src/legacy/lz77.rs index e52cc1068..ae3134aa3 100644 --- a/src/legacy/lz77.rs +++ b/src/legacy/lz77.rs @@ -2,7 +2,7 @@ use std::collections::VecDeque; /// Output the (dist,len) back reference at dst_pos in dst. 
pub fn lz77_output_backref(dst: &mut VecDeque, dist: usize, len: usize) { - // assert!(dist <= dst_pos, "cannot reference before beginning of dst"); + // debug_assert!(dist <= dst_pos, "cannot reference before beginning of dst"); for _ in 0..len { dst.push_back(dst[dst.len() - dist]); diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index ba65fe91b..c3b547af3 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -1,40 +1,20 @@ use std::collections::VecDeque; use std::io::{self, copy, Read, Result}; -use thiserror::Error; - use crate::legacy::lz77::lz77_output_backref; use super::bitstream::{lsb, BitStream}; -#[derive(Error, Debug)] -enum ReduceError { - #[error("Invalid follower set")] - InvalidFollowerSet, - #[error("Error reading next byte")] - ErrorReadingNextByte, -} - /// Number of bits used to represent indices in a follower set of size n. fn follower_idx_bw(n: u8) -> u8 { - assert!(n <= 32); - - if n > 16 { - return 5; + debug_assert!(n <= 32); + if n == 0 { + return 0; } - if n > 8 { - return 4; - } - if n > 4 { - return 3; - } - if n > 2 { - return 2; - } - if n > 0 { + if n == 1 { return 1; } - return 0; + 5 - ((n - 1) << 3).leading_zeros() as u8 } #[derive(Default, Clone, Copy)] @@ -45,27 +25,26 @@ struct FollowerSet { } /// Read the follower sets from is into fsets. Returns true on success. 
-fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> bool { - for i in (0..=255 as usize).rev() { +fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> io::Result<()> { + for i in (0..=u8::MAX as usize).rev() { let n = lsb(is.bits(), 6) as u8; if n > 32 { - return false; - } - if !is.advance(6) { - return false; + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid follower set", + )); } + is.advance(6)?; fsets[i].size = n; fsets[i].idx_bw = follower_idx_bw(n); for j in 0..fsets[i].size as usize { fsets[i].followers[j] = is.bits() as u8; - if !is.advance(8) { - return false; - } + is.advance(8)?; } } - return true; + Ok(()) } /// Read the next byte from is, decoded based on prev_byte and the follower sets. @@ -76,47 +55,51 @@ fn read_next_byte( prev_byte: u8, fsets: &mut [FollowerSet], out_byte: &mut u8, -) -> bool { +) -> io::Result<()> { let bits = is.bits(); if fsets[prev_byte as usize].size == 0 { // No followers; read a literal byte. *out_byte = bits as u8; - return is.advance(8); + is.advance(8)?; + return Ok(()); } if lsb(bits, 1) == 1 { // Don't use the follower set; read a literal byte. *out_byte = (bits >> 1) as u8; - return is.advance(1 + 8); + is.advance(1 + 8)?; + return Ok(()); } // The bits represent the index of a follower byte. let idx_bw = fsets[prev_byte as usize].idx_bw; let follower_idx = lsb(bits >> 1, idx_bw) as usize; if follower_idx >= fsets[prev_byte as usize].size as usize { - return false; + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid follower index", + )); } *out_byte = fsets[prev_byte as usize].followers[follower_idx]; - return is.advance(1 + idx_bw); + is.advance(1 + idx_bw)?; + Ok(()) } fn max_len(comp_factor: u8) -> usize { let v_len_bits = (8 - comp_factor) as usize; - assert!(comp_factor >= 1 && comp_factor <= 4); + debug_assert!(comp_factor >= 1 && comp_factor <= 4); // Bits in V + extra len byte + implicit 3. 
- ((1 << v_len_bits) - 1) + 255 + 3 + ((1 << v_len_bits) - 1) + u8::MAX as usize + 3 } fn max_dist(comp_factor: u8) -> usize { + debug_assert!(comp_factor >= 1 && comp_factor <= 4); let v_dist_bits = comp_factor as usize; - - assert!(comp_factor >= 1 && comp_factor <= 4); - // Bits in V * 256 + W byte + implicit 1. */ - ((1 << v_dist_bits) - 1) * 256 + 255 + 1 + ((1 << v_dist_bits) - 1) * 256 + u8::MAX as usize + 1 } const DLE_BYTE: u8 = 144; @@ -128,14 +111,12 @@ fn hwexpand( comp_factor: u8, src_used: &mut usize, dst: &mut VecDeque, -) -> core::result::Result<(), ReduceError> { +) -> io::Result<()> { let mut fsets = [FollowerSet::default(); 256]; - assert!(comp_factor >= 1 && comp_factor <= 4); + debug_assert!(comp_factor >= 1 && comp_factor <= 4); let mut is = BitStream::new(src, src_len); - if !read_follower_sets(&mut is, &mut fsets) { - return Err(ReduceError::InvalidFollowerSet); - } + read_follower_sets(&mut is, &mut fsets)?; // Number of bits in V used for backref length. let v_len_bits = 8 - comp_factor; @@ -144,9 +125,7 @@ fn hwexpand( while dst.len() < uncomp_len { // Read a literal byte or DLE marker. - if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { - return Err(ReduceError::ErrorReadingNextByte); - } + read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; if curr_byte != DLE_BYTE { // Output a literal byte. dst.push_back(curr_byte); @@ -154,9 +133,7 @@ fn hwexpand( } // Read the V byte which determines the length. - if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { - return Err(ReduceError::ErrorReadingNextByte); - } + read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; if curr_byte == 0 { // Output a literal DLE byte. dst.push_back(DLE_BYTE); @@ -166,21 +143,17 @@ fn hwexpand( let mut len = lsb(v as u64, v_len_bits) as usize; if len == (1 << v_len_bits) - 1 { // Read an extra length byte. 
- if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { - return Err(ReduceError::ErrorReadingNextByte); - } + read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; len += curr_byte as usize; } len += 3; // Read the W byte, which together with V gives the distance. - if !read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte) { - return Err(ReduceError::ErrorReadingNextByte); - } + read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; let dist = ((v as usize) >> v_len_bits) * 256 + curr_byte as usize + 1; - assert!(len <= max_len(comp_factor)); - assert!(dist as usize <= max_dist(comp_factor)); + debug_assert!(len <= max_len(comp_factor)); + debug_assert!(dist as usize <= max_dist(comp_factor)); // Output the back reference. if len <= uncomp_len - dst.len() && dist as usize <= dst.len() { @@ -238,16 +211,14 @@ impl Read for ReduceDecoder { return Err(err.into()); } let mut src_used = 0; - if let Err(err) = hwexpand( + hwexpand( &compressed_bytes, compressed_bytes.len(), self.uncompressed_size as usize, self.comp_factor, &mut src_used, &mut self.stream, - ) { - return Err(io::Error::new(io::ErrorKind::InvalidData, err.to_string())); - } + )?; } let bytes_read = self.stream.len().min(buf.len()); buf[..bytes_read].copy_from_slice(&self.stream.drain(..bytes_read).collect::>()); @@ -259,6 +230,8 @@ impl Read for ReduceDecoder { mod tests { use std::collections::VecDeque; + use crate::legacy::reduce::follower_idx_bw; + use super::hwexpand; const HAMLET_2048: [u8; 1285] = [ @@ -485,4 +458,30 @@ mod tests { assert_eq!(dst[(1 << 11) + i], 0); } } + + fn orig_follower_idx_bw(n: u8) -> u8 { + if n > 16 { + return 5; + } + if n > 8 { + return 4; + } + if n > 4 { + return 3; + } + if n > 2 { + return 2; + } + if n > 0 { + return 1; + } + return 0; + } + + #[test] + fn test_follower_idx_bw() { + for i in 0..=32 { + assert_eq!(orig_follower_idx_bw(i), follower_idx_bw(i)); + } + } } diff --git a/src/legacy/shrink.rs 
b/src/legacy/shrink.rs index a70d1e4e3..21069c36a 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -1,7 +1,5 @@ use std::collections::VecDeque; -use std::io::{self, copy, Read}; - -use thiserror::Error; +use std::io::{self, copy, Error, Read}; use super::bitstream::{lsb, BitStream}; @@ -17,7 +15,7 @@ const PARTIAL_CLEAR: u64 = 2; // const HASH_BITS: usize = MAX_CODE_SIZE + 1; /* For a load factor of 0.5. */ // const HASHTAB_SIZE: usize = 1 << HASH_BITS; const UNKNOWN_LEN: u16 = u16::MAX; - +/* #[derive(Error, Debug)] enum ShrinkError { #[error("self-referential code")] @@ -31,7 +29,7 @@ enum ShrinkError { #[error("prev code no longer valid")] PrevCodeNoLongerValid, -} +}*/ struct CodeQueue { next_idx: usize, codes: [u16; MAX_CODE as usize - CONTROL_CODE + 1], @@ -127,35 +125,35 @@ fn read_code( codetab: &mut [Codetab], queue: &mut CodeQueue, next_code: &mut u16, -) -> bool { +) -> io::Result<()> { // assert(sizeof(code) * CHAR_BIT >= *code_size); let code = lsb(is.bits(), *code_size) as u16; - if !is.advance(*code_size) { - return false; - } + is.advance(*code_size)?; // Handle regular codes (the common case). if code != CONTROL_CODE as u16 { *next_code = code; - return true; + return Ok(()); } // Handle control codes. let control_code = lsb(is.bits(), *code_size); - if !is.advance(*code_size) { + if is.advance(*code_size).is_err() { *next_code = INVALID_CODE; - return true; + return Ok(()); } if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE { (*code_size) += 1; - return read_code(is, code_size, codetab, queue, next_code); + read_code(is, code_size, codetab, queue, next_code)?; + return Ok(()); } if control_code == PARTIAL_CLEAR { unshrink_partial_clear(codetab, queue); - return read_code(is, code_size, codetab, queue, next_code); + read_code(is, code_size, codetab, queue, next_code)?; + return Ok(()); } *next_code = INVALID_CODE; - return true; + return Ok(()); } /// Output the string represented by a code into dst at dst_pos. 
Returns @@ -169,8 +167,8 @@ fn output_code( queue: &mut CodeQueue, first_byte: &mut u8, len: &mut usize, -) -> Result<(), ShrinkError> { - assert!(code <= MAX_CODE as u16 && code != CONTROL_CODE as u16); +) -> io::Result<()> { + debug_assert!(code <= MAX_CODE as u16 && code != CONTROL_CODE as u16); if code <= u8::MAX as u16 { // Output literal byte. *first_byte = code as u8; @@ -184,7 +182,7 @@ fn output_code( { // Reject invalid codes. Self-referential codes may exist in // the table but cannot be used. - return Err(ShrinkError::InvalidPrefixCode); + return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid code")); } if codetab[code as usize].len != UNKNOWN_LEN { @@ -202,15 +200,15 @@ fn output_code( // was invalid (due to partial clearing) when the code was inserted into // the table. The prefix can then become valid when it's added to the // table at a later point. - assert!(codetab[code as usize].len == UNKNOWN_LEN); + debug_assert!(codetab[code as usize].len == UNKNOWN_LEN); let prefix_code = codetab[code as usize].prefix_code; - assert!(prefix_code as usize > CONTROL_CODE); + debug_assert!(prefix_code as usize > CONTROL_CODE); if prefix_code == queue.next() { /* The prefix code hasn't been added yet, but we were just about to: the KwKwK case. Add the previous string extended with its first byte. */ - assert!(codetab[prev_code as usize].prefix_code != INVALID_CODE); + debug_assert!(codetab[prev_code as usize].prefix_code != INVALID_CODE); codetab[prefix_code as usize].prefix_code = prev_code; codetab[prefix_code as usize].ext_byte = *first_byte; codetab[prefix_code as usize].len = codetab[prev_code as usize].len + 1; @@ -218,7 +216,10 @@ fn output_code( dst.push_back(*first_byte); } else if codetab[prefix_code as usize].prefix_code == INVALID_CODE { // The prefix code is still invalid. 
- return Err(ShrinkError::InvalidPrefixCode); + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid prefix code", + )); } // Output the prefix string, then the extension byte. @@ -232,7 +233,7 @@ fn output_code( *first_byte = dst[ct.last_dst_pos]; // Update the code table now that the string has a length and pos. - assert!(prev_code != code); + debug_assert!(prev_code != code); codetab[code as usize].len = *len as u16; codetab[code as usize].last_dst_pos = last_dst_pos; @@ -245,7 +246,7 @@ fn hwunshrink( uncompressed_size: usize, src_used: &mut usize, dst: &mut VecDeque, -) -> Result<(), ShrinkError> { +) -> io::Result<()> { let mut codetab = Codetab::new(); let mut queue = CodeQueue::new(); let mut is = BitStream::new(src, src_len); @@ -253,19 +254,24 @@ fn hwunshrink( // Handle the first code separately since there is no previous code. let mut curr_code = 0; - if !read_code( + if read_code( &mut is, &mut code_size, &mut codetab, &mut queue, &mut curr_code, - ) { + ) + .is_err() + { *src_used = is.bytes_read(); return Ok(()); } - assert!(curr_code != CONTROL_CODE as u16); + debug_assert!(curr_code != CONTROL_CODE as u16); if curr_code > u8::MAX as u16 { - return Err(ShrinkError::FirstCodeNeedsToBeLiteral); /* The first code must be a literal. */ + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "the first code must be a literal", + )); } let mut first_byte = curr_code as u8; codetab[curr_code as usize].last_dst_pos = dst.len(); @@ -280,25 +286,27 @@ fn hwunshrink( &mut queue, &mut curr_code, ) + .is_ok() { println!("{}", dst.len()); if curr_code == INVALID_CODE { - return Err(ShrinkError::InvalidCode); + return Err(Error::new(io::ErrorKind::InvalidData, "Invalid code")); } let dst_pos = dst.len(); // Handle KwKwK: next code used before being added. if curr_code == queue.next() { if codetab[prev_code as usize].prefix_code == INVALID_CODE { - // The previous code is no longer valid. 
- return Err(ShrinkError::PrevCodeNoLongerValid); + return Err(Error::new( + io::ErrorKind::InvalidData, + "Previous code no longer valid", + )); } // Extend the previous code with its first byte. - assert!(curr_code != prev_code); + debug_assert!(curr_code != prev_code); codetab[curr_code as usize].prefix_code = prev_code; codetab[curr_code as usize].ext_byte = first_byte; codetab[curr_code as usize].len = codetab[prev_code as usize].len + 1; codetab[curr_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; - // assert!(dst_pos < dst_cap); // dst.push_back(first_byte); } @@ -319,8 +327,8 @@ fn hwunshrink( // Verify that the output matches walking the prefixes. let mut c = curr_code; for i in 0..len { - assert!(codetab[c as usize].len as usize == len - i); - // assert!(codetab[c as usize].ext_byte == dst[dst_pos + len - i - 1]); + debug_assert!(codetab[c as usize].len as usize == len - i); + // debug_assert!(codetab[c as usize].ext_byte == dst[dst_pos + len - i - 1]); c = codetab[c as usize].prefix_code; } @@ -329,7 +337,7 @@ fn hwunshrink( // the first byte of the current code's string. 
let new_code = queue.remove_next(); if new_code != INVALID_CODE { - //assert!(codetab[prev_code as usize].last_dst_pos < dst_pos); + //debug_assert!(codetab[prev_code as usize].last_dst_pos < dst_pos); codetab[new_code as usize].prefix_code = prev_code; codetab[new_code as usize].ext_byte = first_byte; codetab[new_code as usize].len = codetab[prev_code as usize].len + 1; @@ -388,15 +396,13 @@ impl Read for ShrinkDecoder { return Err(err.into()); } let mut src_used = compressed_bytes.len(); - if let Err(err) = hwunshrink( + hwunshrink( &compressed_bytes, compressed_bytes.len(), self.uncompressed_size as usize, &mut src_used, &mut self.stream, - ) { - return Err(io::Error::new(io::ErrorKind::InvalidData, err.to_string())); - } + )?; } let bytes_read = self.stream.len().min(buf.len()); buf[..bytes_read].copy_from_slice(&self.stream.drain(..bytes_read).collect::>()); From bb8d030b8dfbf8f791ad728fab17048b9d00a2a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 06:22:11 +0200 Subject: [PATCH 06/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 21069c36a..163cabc64 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -82,15 +82,15 @@ impl Default for Codetab { impl Codetab { pub fn new() -> [Self; MAX_CODE + 1] { - let mut codetab = [Codetab::default(); MAX_CODE + 1]; - // Codes for literal bytes. Set a phony prefix_code so they're valid. 
- for i in 0..=u8::MAX as usize { - codetab[i].prefix_code = i as u16; - codetab[i].ext_byte = i as u8; - codetab[i].len = 1; - } - - codetab + let mut codetab = 0..=u8::MAX + .map(|i| Codetab { + prefix_code: i as u16, + ext_byte: i, + len: 1, + }) + .collect::Vec<_>(); + codetab.resize(MAX_CODE + 1, Codetab::default()); + codetab.try_into().unwrap() } } fn unshrink_partial_clear(codetab: &mut [Codetab], queue: &mut CodeQueue) { From 905eeeaa6bda20b9fb83155d089acc559d36f55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 06:37:50 +0200 Subject: [PATCH 07/50] Use Option instead of C like invalid values. --- src/legacy/bitstream.rs | 16 +++--- src/legacy/huffman.rs | 6 +- src/legacy/shrink.rs | 124 +++++++++++++++++----------------------- 3 files changed, 64 insertions(+), 82 deletions(-) diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index b80a9715d..a5efa190a 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -8,7 +8,7 @@ pub fn lsb(x: u64, n: u8) -> u64 { /// Reverse the n least significant bits of x. /// The (16 - n) most significant bits of the result will be zero. 
-pub fn reverse16(x: u16, n: usize) -> u16 { +pub fn reverse_lsb(x: u16, n: usize) -> u16 { debug_assert!(n > 0); debug_assert!(n <= 16); return x.reverse_bits() >> (16 - n); @@ -94,18 +94,18 @@ pub const ISTREAM_MIN_BITS: usize = 64 - 7; #[cfg(test)] mod tests { - use crate::legacy::bitstream::{lsb, reverse16}; + use crate::legacy::bitstream::{lsb, reverse_lsb}; #[test] fn test_reverse16() { - assert_eq!(reverse16(0x0000, 1), 0x0); - assert_eq!(reverse16(0xffff, 1), 0x1); - assert_eq!(reverse16(0x0000, 16), 0x0); - assert_eq!(reverse16(0xffff, 16), 0xffff); + assert_eq!(reverse_lsb(0x0000, 1), 0x0); + assert_eq!(reverse_lsb(0xffff, 1), 0x1); + assert_eq!(reverse_lsb(0x0000, 16), 0x0); + assert_eq!(reverse_lsb(0xffff, 16), 0xffff); // 0001 0010 0011 0100 -> 0010 1100 0100 1000 - assert_eq!(reverse16(0x1234, 16), 0x2c48); + assert_eq!(reverse_lsb(0x1234, 16), 0x2c48); // 111 1111 0100 0001 -> 100 0001 0111 1111 - assert_eq!(reverse16(0x7f41, 15), 0x417f); + assert_eq!(reverse_lsb(0x7f41, 15), 0x417f); } /* #[test] diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index fe0bd69ab..793a57f87 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -1,4 +1,4 @@ -use crate::legacy::bitstream::reverse16; +use crate::legacy::bitstream::reverse_lsb; use super::bitstream::lsb; @@ -103,7 +103,7 @@ impl HuffmanDecoder { pub fn table_insert(&mut self, sym: usize, len: usize, codeword: u16) { debug_assert!(len <= HUFFMAN_LOOKUP_TABLE_BITS as usize); - let codeword = reverse16(codeword, len); // Make it LSB-first. + let codeword = reverse_lsb(codeword, len); // Make it LSB-first. let pad_len = HUFFMAN_LOOKUP_TABLE_BITS as usize - len; // Pad the pad_len upper bits with all bit combinations. @@ -133,7 +133,7 @@ impl HuffmanDecoder { } // Then do canonical decoding with the bits in MSB-first order. 
- let mut bits = reverse16(bits, MAX_HUFFMAN_BITS); + let mut bits = reverse_lsb(bits, MAX_HUFFMAN_BITS); for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..=MAX_HUFFMAN_BITS { if (bits as u32) < self.sentinel_bits[l] { bits >>= MAX_HUFFMAN_BITS - l; diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 163cabc64..d41b1ccdf 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -7,7 +7,6 @@ const MIN_CODE_SIZE: u8 = 9; const MAX_CODE_SIZE: u8 = 13; const MAX_CODE: usize = (1 << MAX_CODE_SIZE) - 1; -const INVALID_CODE: u16 = u16::MAX; const CONTROL_CODE: usize = 256; const INC_CODE_SIZE: u64 = 1; const PARTIAL_CLEAR: u64 = 2; @@ -32,38 +31,38 @@ enum ShrinkError { }*/ struct CodeQueue { next_idx: usize, - codes: [u16; MAX_CODE as usize - CONTROL_CODE + 1], + codes: [Option; MAX_CODE as usize - CONTROL_CODE + 1], } impl CodeQueue { fn new() -> Self { - let mut codes = [0; MAX_CODE as usize - CONTROL_CODE + 1]; + let mut codes = [None; MAX_CODE as usize - CONTROL_CODE + 1]; for (i, code) in (CONTROL_CODE as u16 + 1..=MAX_CODE as u16).enumerate() { - codes[i] = code; + codes[i] = Some(code); } Self { next_idx: 0, codes } } // Return the next code in the queue, or INVALID_CODE if the queue is empty. - fn next(&self) -> u16 { + fn next(&self) -> Option { // assert(q->next_idx < sizeof(q->codes) / sizeof(q->codes[0])); self.codes[self.next_idx] } /// Return and remove the next code from the queue, or return INVALID_CODE if /// the queue is empty. - fn remove_next(&mut self) -> u16 { - let code = self.next(); - if code != INVALID_CODE { + fn remove_next(&mut self) -> Option { + let res = self.next(); + if res.is_some() { self.next_idx += 1; } - code + res } } -#[derive(Clone, Copy)] +#[derive(Clone, Debug, Copy)] struct Codetab { - prefix_code: u16, // INVALID_CODE means the entry is invalid. 
+ prefix_code: Option, ext_byte: u8, len: u16, last_dst_pos: usize, @@ -72,7 +71,7 @@ struct Codetab { impl Default for Codetab { fn default() -> Self { Self { - prefix_code: INVALID_CODE, + prefix_code: None, ext_byte: 0, len: 0, last_dst_pos: 0, @@ -81,25 +80,27 @@ impl Default for Codetab { } impl Codetab { - pub fn new() -> [Self; MAX_CODE + 1] { - let mut codetab = 0..=u8::MAX + pub fn create_new() -> [Self; MAX_CODE + 1] { + let mut codetab = (0..=u8::MAX) .map(|i| Codetab { - prefix_code: i as u16, + prefix_code: Some(i as u16), ext_byte: i, len: 1, + last_dst_pos: 0, }) - .collect::Vec<_>(); + .collect::>(); codetab.resize(MAX_CODE + 1, Codetab::default()); codetab.try_into().unwrap() } } + fn unshrink_partial_clear(codetab: &mut [Codetab], queue: &mut CodeQueue) { let mut is_prefix = [false; MAX_CODE + 1]; // Scan for codes that have been used as a prefix. for i in CONTROL_CODE + 1..=MAX_CODE { - if codetab[i].prefix_code != INVALID_CODE { - is_prefix[codetab[i].prefix_code as usize] = true; + if let Some(prefix_code) = codetab[i].prefix_code { + is_prefix[prefix_code as usize] = true; } } @@ -107,12 +108,12 @@ fn unshrink_partial_clear(codetab: &mut [Codetab], queue: &mut CodeQueue) { let mut code_queue_size = 0; for i in CONTROL_CODE + 1..MAX_CODE { if !is_prefix[i] { - codetab[i].prefix_code = INVALID_CODE; - queue.codes[code_queue_size] = i as u16; + codetab[i].prefix_code = None; + queue.codes[code_queue_size] = Some(i as u16); code_queue_size += 1; } } - queue.codes[code_queue_size] = INVALID_CODE; // End-of-queue marker. + queue.codes[code_queue_size] = None; // End-of-queue marker. queue.next_idx = 0; } @@ -124,36 +125,30 @@ fn read_code( code_size: &mut u8, codetab: &mut [Codetab], queue: &mut CodeQueue, - next_code: &mut u16, -) -> io::Result<()> { +) -> io::Result> { // assert(sizeof(code) * CHAR_BIT >= *code_size); let code = lsb(is.bits(), *code_size) as u16; is.advance(*code_size)?; // Handle regular codes (the common case). 
if code != CONTROL_CODE as u16 { - *next_code = code; - return Ok(()); + return Ok(Some(code)); } // Handle control codes. let control_code = lsb(is.bits(), *code_size); if is.advance(*code_size).is_err() { - *next_code = INVALID_CODE; - return Ok(()); + return Ok(None); } if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE { (*code_size) += 1; - read_code(is, code_size, codetab, queue, next_code)?; - return Ok(()); + return read_code(is, code_size, codetab, queue); } if control_code == PARTIAL_CLEAR { unshrink_partial_clear(codetab, queue); - read_code(is, code_size, codetab, queue, next_code)?; - return Ok(()); + return read_code(is, code_size, codetab, queue); } - *next_code = INVALID_CODE; - return Ok(()); + return Ok(None); } /// Output the string represented by a code into dst at dst_pos. Returns @@ -177,8 +172,8 @@ fn output_code( return Ok(()); } - if codetab[code as usize].prefix_code == INVALID_CODE - || codetab[code as usize].prefix_code == code + if codetab[code as usize].prefix_code.is_none() + || codetab[code as usize].prefix_code == Some(code) { // Reject invalid codes. Self-referential codes may exist in // the table but cannot be used. @@ -201,20 +196,20 @@ fn output_code( // the table. The prefix can then become valid when it's added to the // table at a later point. debug_assert!(codetab[code as usize].len == UNKNOWN_LEN); - let prefix_code = codetab[code as usize].prefix_code; + let prefix_code = codetab[code as usize].prefix_code.unwrap(); debug_assert!(prefix_code as usize > CONTROL_CODE); - if prefix_code == queue.next() { + if Some(prefix_code) == queue.next() { /* The prefix code hasn't been added yet, but we were just about to: the KwKwK case. Add the previous string extended with its first byte. 
*/ - debug_assert!(codetab[prev_code as usize].prefix_code != INVALID_CODE); - codetab[prefix_code as usize].prefix_code = prev_code; + debug_assert!(codetab[prev_code as usize].prefix_code.is_some()); + codetab[prefix_code as usize].prefix_code = Some(prev_code); codetab[prefix_code as usize].ext_byte = *first_byte; codetab[prefix_code as usize].len = codetab[prev_code as usize].len + 1; codetab[prefix_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; dst.push_back(*first_byte); - } else if codetab[prefix_code as usize].prefix_code == INVALID_CODE { + } else if codetab[prefix_code as usize].prefix_code.is_none() { // The prefix code is still invalid. return Err(io::Error::new( io::ErrorKind::InvalidData, @@ -247,25 +242,22 @@ fn hwunshrink( src_used: &mut usize, dst: &mut VecDeque, ) -> io::Result<()> { - let mut codetab = Codetab::new(); + let mut codetab = Codetab::create_new(); let mut queue = CodeQueue::new(); let mut is = BitStream::new(src, src_len); let mut code_size = MIN_CODE_SIZE; // Handle the first code separately since there is no previous code. 
- let mut curr_code = 0; - if read_code( + let Ok(Some(curr_code)) = read_code( &mut is, &mut code_size, &mut codetab, &mut queue, - &mut curr_code, - ) - .is_err() - { + ) else { *src_used = is.bytes_read(); return Ok(()); - } + }; + debug_assert!(curr_code != CONTROL_CODE as u16); if curr_code > u8::MAX as u16 { return Err(io::Error::new( @@ -278,24 +270,22 @@ fn hwunshrink( dst.push_back(curr_code as u8); let mut prev_code = curr_code; - while dst.len() < uncompressed_size - && read_code( + while dst.len() < uncompressed_size { + let Ok(curr_code) = read_code( &mut is, &mut code_size, &mut codetab, &mut queue, - &mut curr_code, - ) - .is_ok() - { - println!("{}", dst.len()); - if curr_code == INVALID_CODE { + ) else { break; }; + + let Some(curr_code) = curr_code else { return Err(Error::new(io::ErrorKind::InvalidData, "Invalid code")); - } + }; + let dst_pos = dst.len(); // Handle KwKwK: next code used before being added. - if curr_code == queue.next() { - if codetab[prev_code as usize].prefix_code == INVALID_CODE { + if Some(curr_code) == queue.next() { + if codetab[prev_code as usize].prefix_code.is_none() { return Err(Error::new( io::ErrorKind::InvalidData, "Previous code no longer valid", @@ -303,7 +293,7 @@ fn hwunshrink( } // Extend the previous code with its first byte. debug_assert!(curr_code != prev_code); - codetab[curr_code as usize].prefix_code = prev_code; + codetab[curr_code as usize].prefix_code = Some(prev_code); codetab[curr_code as usize].ext_byte = first_byte; codetab[curr_code as usize].len = codetab[prev_code as usize].len + 1; codetab[curr_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; @@ -324,26 +314,18 @@ fn hwunshrink( return Err(s); } - // Verify that the output matches walking the prefixes. 
- let mut c = curr_code; - for i in 0..len { - debug_assert!(codetab[c as usize].len as usize == len - i); - // debug_assert!(codetab[c as usize].ext_byte == dst[dst_pos + len - i - 1]); - c = codetab[c as usize].prefix_code; - } - // Add a new code to the string table if there's room. // The string is the previous code's string extended with // the first byte of the current code's string. let new_code = queue.remove_next(); - if new_code != INVALID_CODE { + if let Some(new_code) = new_code { //debug_assert!(codetab[prev_code as usize].last_dst_pos < dst_pos); - codetab[new_code as usize].prefix_code = prev_code; + codetab[new_code as usize].prefix_code = Some(prev_code); codetab[new_code as usize].ext_byte = first_byte; codetab[new_code as usize].len = codetab[prev_code as usize].len + 1; codetab[new_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; - if codetab[prev_code as usize].prefix_code == INVALID_CODE { + if codetab[prev_code as usize].prefix_code.is_none() { // prev_code was invalidated in a partial // clearing. Until that code is re-used, the // string represented by new_code is From 45d79f627a16a5393502a5cc7cacd3bc491d7124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 07:04:30 +0200 Subject: [PATCH 08/50] Removed comments. --- src/legacy/bitstream.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index a5efa190a..49dc32500 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -14,12 +14,6 @@ pub fn reverse_lsb(x: u16, n: usize) -> u16 { return x.reverse_bits() >> (16 - n); } -/* -pub fn round_up(x: usize, m: usize) -> usize { - debug_assert!((m & (m - 1)) == 0, "m must be a power of two"); - (x + m - 1) & (-(m as i64)) as usize // Hacker's Delight (2nd), 3-1. -} -*/ /// Input bitstream. pub struct BitStream<'a> { src: &'a [u8], /* Source bytes. 
*/ @@ -107,16 +101,6 @@ mod tests { // 111 1111 0100 0001 -> 100 0001 0111 1111 assert_eq!(reverse_lsb(0x7f41, 15), 0x417f); } - /* - #[test] - fn test_bits_round_up() { - assert_eq!(round_up(0, 4), 0); - assert_eq!(round_up(1, 4), 4); - assert_eq!(round_up(2, 4), 4); - assert_eq!(round_up(3, 4), 4); - assert_eq!(round_up(4, 4), 4); - assert_eq!(round_up(5, 4), 8); - }*/ #[test] fn test_bits_test_bits_lsbround_up() { From db40e5b92df36b29133df2c61bd532b46f3ee3e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 07:18:36 +0200 Subject: [PATCH 09/50] huffman_decode now returns a Result. --- src/legacy/bitstream.rs | 2 +- src/legacy/huffman.rs | 23 ++++++++++++++--------- src/legacy/implode.rs | 13 +++++-------- src/legacy/shrink.rs | 20 ++++++-------------- 4 files changed, 26 insertions(+), 32 deletions(-) diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index 49dc32500..9662e420f 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -46,7 +46,7 @@ impl<'a> BitStream<'a> { // Read the available bytes and zero-pad. let mut bits = 0; for i in 0..self.src.len() - next { - bits |= (self.src[next + i] as u64).wrapping_shl(i as u32 * 8); + bits |= (self.src[next + i] as u64) << (i as u32 * 8); } bits }; diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 793a57f87..4708d1e17 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -1,3 +1,5 @@ +use std::io::{self, Error}; + use crate::legacy::bitstream::reverse_lsb; use super::bitstream::lsb; @@ -120,7 +122,7 @@ impl HuffmanDecoder { /// Returns the decoded symbol number or -1 if no symbol could be decoded. /// *num_used_bits will be set to the number of bits used to decode the symbol, /// or zero if no symbol could be decoded. - pub fn huffman_decode(&mut self, bits: u16, num_used_bits: &mut u8) -> i32 { + pub fn huffman_decode(&mut self, bits: u16, num_used_bits: &mut u8) -> std::io::Result { // First try the lookup table. 
let lookup_bits = lsb(bits as u64, HUFFMAN_LOOKUP_TABLE_BITS) as usize; debug_assert!(lookup_bits < self.table.len()); @@ -129,7 +131,7 @@ impl HuffmanDecoder { debug_assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); // debug_assert!(self.table[lookup_bits].sym < self.num_syms); *num_used_bits = self.table[lookup_bits].len; - return self.table[lookup_bits].sym as i32; + return Ok(self.table[lookup_bits].sym); } // Then do canonical decoding with the bits in MSB-first order. @@ -142,11 +144,14 @@ impl HuffmanDecoder { //assert(sym_idx < self.num_syms); *num_used_bits = l as u8; - return self.syms[sym_idx] as i32; + return Ok(self.syms[sym_idx]); } } *num_used_bits = 0; - -1 + Err(Error::new( + io::ErrorKind::InvalidData, + "huffman decode failed", + )) } } @@ -183,23 +188,23 @@ mod tests { let mut used = 0; // 000 (msb-first) -> 000 (lsb-first) - assert_eq!(d.huffman_decode(0x0, &mut used), 0); + assert_eq!(d.huffman_decode(0x0, &mut used).unwrap(), 0); assert_eq!(used, 3); /* 011 (msb-first) -> 110 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x6, &mut used), 3); + assert_eq!(d.huffman_decode(0x6, &mut used).unwrap(), 3); assert_eq!(used, 3); /* 11110 (msb-first) -> 01111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x0f, &mut used), 17); + assert_eq!(d.huffman_decode(0x0f, &mut used).unwrap(), 17); assert_eq!(used, 5); /* 111110 (msb-first) -> 011111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x1f, &mut used), 16); + assert_eq!(d.huffman_decode(0x1f, &mut used).unwrap(), 16); assert_eq!(used, 6); /* 1111111 (msb-first) -> 1111111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x7f, &mut used), -1); + assert!(d.huffman_decode(0x7f, &mut used).is_err()); /* Make sure used is set even when decoding fails. 
*/ assert_eq!(used, 0); diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 7e36aab49..2dd644612 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -124,14 +124,13 @@ fn hwexplode( let sym; let mut used = 0; if lit_tree { - sym = lit_decoder.huffman_decode(!bits as u16, &mut used); - debug_assert!(sym >= 0, "huffman decode failed"); + sym = lit_decoder.huffman_decode(!bits as u16, &mut used)?; is.advance(1 + used)?; } else { - sym = lsb(bits, 8) as i32; + sym = lsb(bits, 8) as u16; is.advance(1 + 8)?; } - debug_assert!(sym >= 0 && sym <= u8::MAX as i32); + debug_assert!(sym <= u8::MAX as u16); dst.push_back(sym as u8); continue; } @@ -154,16 +153,14 @@ fn hwexplode( // Read the Huffman-encoded high dist bits. let mut used = 0; - let sym = dist_decoder.huffman_decode(!bits as u16, &mut used); - debug_assert!(sym >= 0, "huffman decode failed"); + let sym = dist_decoder.huffman_decode(!bits as u16, &mut used)?; used_tot += used; bits >>= used; dist |= (sym as usize) << if large_wnd { 7 } else { 6 }; dist += 1; // Read the Huffman-encoded len. - let sym = len_decoder.huffman_decode(!bits as u16, &mut used); - debug_assert!(sym >= 0, "huffman decode failed"); + let sym = len_decoder.huffman_decode(!bits as u16, &mut used)?; used_tot += used; bits >>= used; let mut len = (sym + min_len) as usize; diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index d41b1ccdf..6618ed6df 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -248,12 +248,7 @@ fn hwunshrink( let mut code_size = MIN_CODE_SIZE; // Handle the first code separately since there is no previous code. 
- let Ok(Some(curr_code)) = read_code( - &mut is, - &mut code_size, - &mut codetab, - &mut queue, - ) else { + let Ok(Some(curr_code)) = read_code(&mut is, &mut code_size, &mut codetab, &mut queue) else { *src_used = is.bytes_read(); return Ok(()); }; @@ -271,14 +266,11 @@ fn hwunshrink( let mut prev_code = curr_code; while dst.len() < uncompressed_size { - let Ok(curr_code) = read_code( - &mut is, - &mut code_size, - &mut codetab, - &mut queue, - ) else { break; }; - - let Some(curr_code) = curr_code else { + let Ok(curr_code) = read_code(&mut is, &mut code_size, &mut codetab, &mut queue) else { + break; + }; + + let Some(curr_code) = curr_code else { return Err(Error::new(io::ErrorKind::InvalidData, "Invalid code")); }; From 4189412db89465fc0a6d0b23f4d75f5ba97e3e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 07:39:14 +0200 Subject: [PATCH 10/50] Removed legacy-zip feature. --- Cargo.toml | 2 -- src/compression.rs | 18 ------------------ src/lib.rs | 1 - src/read.rs | 14 -------------- src/write.rs | 3 --- 5 files changed, 38 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9b8869909..cf36d0041 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,7 +72,6 @@ deflate-zlib = ["flate2/zlib", "_deflate-any"] deflate-zlib-ng = ["flate2/zlib-ng", "_deflate-any"] deflate-zopfli = ["zopfli", "_deflate-any"] lzma = ["lzma-rs/stream"] -legacy-zip = [] unreserved = [] default = [ "aes-crypto", @@ -84,7 +83,6 @@ default = [ "lzma", "time", "zstd", - "legacy-zip", ] [[bench]] diff --git a/src/compression.rs b/src/compression.rs index dd2ba25d1..e32dd1851 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -40,13 +40,10 @@ pub enum CompressionMethod { Lzma, /// Legacy format - #[cfg(feature = "legacy-zip")] Shrink, /// Reduce (Method 2-5) - #[cfg(feature = "legacy-zip")] Reduce(u8), /// Method 6 Implode/explode - #[cfg(feature = "legacy-zip")] Implode, /// Unsupported compression method #[cfg_attr( @@ -59,17 +56,11 @@ pub enum 
CompressionMethod { /// All compression methods defined for the ZIP format impl CompressionMethod { pub const STORE: Self = CompressionMethod::Stored; - #[cfg(feature = "legacy-zip")] pub const SHRINK: Self = CompressionMethod::Shrink; - #[cfg(feature = "legacy-zip")] pub const REDUCE_1: Self = CompressionMethod::Unsupported(2); - #[cfg(feature = "legacy-zip")] pub const REDUCE_2: Self = CompressionMethod::Unsupported(3); - #[cfg(feature = "legacy-zip")] pub const REDUCE_3: Self = CompressionMethod::Unsupported(4); - #[cfg(feature = "legacy-zip")] pub const REDUCE_4: Self = CompressionMethod::Unsupported(5); - #[cfg(feature = "legacy-zip")] pub const IMPLODE: Self = CompressionMethod::Unsupported(6); #[cfg(feature = "_deflate-any")] pub const DEFLATE: Self = CompressionMethod::Deflated; @@ -115,17 +106,11 @@ impl CompressionMethod { #[allow(deprecated)] match val { 0 => CompressionMethod::Stored, - #[cfg(feature = "legacy-zip")] 1 => CompressionMethod::Shrink, - #[cfg(feature = "legacy-zip")] 2 => CompressionMethod::Reduce(1), - #[cfg(feature = "legacy-zip")] 3 => CompressionMethod::Reduce(2), - #[cfg(feature = "legacy-zip")] 4 => CompressionMethod::Reduce(3), - #[cfg(feature = "legacy-zip")] 5 => CompressionMethod::Reduce(4), - #[cfg(feature = "legacy-zip")] 6 => CompressionMethod::Implode, #[cfg(feature = "_deflate-any")] 8 => CompressionMethod::Deflated, @@ -153,11 +138,8 @@ impl CompressionMethod { #[allow(deprecated)] match self { CompressionMethod::Stored => 0, - #[cfg(feature = "legacy-zip")] CompressionMethod::Shrink => 1, - #[cfg(feature = "legacy-zip")] CompressionMethod::Reduce(n) => 1 + n as u16, - #[cfg(feature = "legacy-zip")] CompressionMethod::Implode => 6, #[cfg(feature = "_deflate-any")] diff --git a/src/lib.rs b/src/lib.rs index f693aa4cc..b3f209e70 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,7 +47,6 @@ mod types; pub mod write; mod zipcrypto; pub use extra_fields::ExtraField; -#[cfg(feature = "legacy-zip")] mod legacy; #[doc = "Unstable 
APIs\n\ diff --git a/src/read.rs b/src/read.rs index 56855dfe7..c6d00c461 100644 --- a/src/read.rs +++ b/src/read.rs @@ -6,9 +6,7 @@ use crate::compression::CompressionMethod; use crate::cp437::FromCp437; use crate::crc32::Crc32Reader; use crate::extra_fields::{ExtendedTimestamp, ExtraField}; -#[cfg(feature = "legacy-zip")] use crate::legacy::ShrinkDecoder; -#[cfg(feature = "legacy-zip")] use crate::legacy::{ImplodeDecoder, ReduceDecoder}; use crate::read::zip_archive::Shared; use crate::result::{ZipError, ZipResult}; @@ -146,11 +144,8 @@ pub(crate) enum ZipFileReader<'a> { NoReader, Raw(io::Take<&'a mut dyn Read>), Stored(Crc32Reader>), - #[cfg(feature = "legacy-zip")] Shrink(Crc32Reader>>), - #[cfg(feature = "legacy-zip")] Reduce(Crc32Reader>>), - #[cfg(feature = "legacy-zip")] Implode(Crc32Reader>>), #[cfg(feature = "_deflate-any")] Deflated(Crc32Reader>>), @@ -170,11 +165,8 @@ impl<'a> Read for ZipFileReader<'a> { ZipFileReader::NoReader => panic!("ZipFileReader was in an invalid state"), ZipFileReader::Raw(r) => r.read(buf), ZipFileReader::Stored(r) => r.read(buf), - #[cfg(feature = "legacy-zip")] ZipFileReader::Shrink(r) => r.read(buf), - #[cfg(feature = "legacy-zip")] ZipFileReader::Reduce(r) => r.read(buf), - #[cfg(feature = "legacy-zip")] ZipFileReader::Implode(r) => r.read(buf), #[cfg(feature = "_deflate-any")] ZipFileReader::Deflated(r) => r.read(buf), @@ -197,7 +189,6 @@ impl<'a> ZipFileReader<'a> { ZipFileReader::NoReader => panic!("ZipFileReader was in an invalid state"), ZipFileReader::Raw(r) => r, ZipFileReader::Stored(r) => r.into_inner().into_inner(), - #[cfg(feature = "legacy-zip")] ZipFileReader::Shrink(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately @@ -206,7 +197,6 @@ impl<'a> ZipFileReader<'a> { } return; } - #[cfg(feature = "legacy-zip")] ZipFileReader::Reduce(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately @@ -215,7 
+205,6 @@ impl<'a> ZipFileReader<'a> { } return; } - #[cfg(feature = "legacy-zip")] ZipFileReader::Implode(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately @@ -343,7 +332,6 @@ pub(crate) fn make_reader( crc32, ae2_encrypted, ))), - #[cfg(feature = "legacy-zip")] CompressionMethod::Shrink => { let reader = ShrinkDecoder::new(reader, uncompressed_size); Ok(ZipFileReader::Shrink(Crc32Reader::new( @@ -352,7 +340,6 @@ pub(crate) fn make_reader( ae2_encrypted, ))) } - #[cfg(feature = "legacy-zip")] CompressionMethod::Reduce(comp_factor) => { let reader = ReduceDecoder::new(reader, uncompressed_size, comp_factor); Ok(ZipFileReader::Reduce(Crc32Reader::new( @@ -361,7 +348,6 @@ pub(crate) fn make_reader( ae2_encrypted, ))) } - #[cfg(feature = "legacy-zip")] CompressionMethod::Implode => { let reader = ImplodeDecoder::new(reader, uncompressed_size, flags); Ok(ZipFileReader::Implode(Crc32Reader::new( diff --git a/src/write.rs b/src/write.rs index a80bccf5b..ff4568489 100644 --- a/src/write.rs +++ b/src/write.rs @@ -1515,15 +1515,12 @@ impl GenericZipWriter { Ok(Box::new(|bare| Storer(bare))) } } - #[cfg(feature = "legacy-zip")] CompressionMethod::Shrink => Err(ZipError::UnsupportedArchive( "Shrink compression unsupported", )), - #[cfg(feature = "legacy-zip")] CompressionMethod::Reduce(_) => Err(ZipError::UnsupportedArchive( "Reduce compression unsupported", )), - #[cfg(feature = "legacy-zip")] CompressionMethod::Implode => Err(ZipError::UnsupportedArchive( "Implode compression unsupported", )), From fafdaf14846fb49026adfaf1cbe687c6ab2d897b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 10:30:33 +0200 Subject: [PATCH 11/50] Refactored huffman implementation a bit. 
--- src/legacy/huffman.rs | 85 ++++++++++++++++++++++--------------------- src/legacy/implode.rs | 4 +- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 4708d1e17..e94a9f098 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -23,9 +23,9 @@ pub struct HuffmanDecoder { /// Lookup table for fast decoding of short codewords. pub table: [TableEntry; 1 << HUFFMAN_LOOKUP_TABLE_BITS], /// "Sentinel bits" value for each codeword length. - pub sentinel_bits: [u32; MAX_HUFFMAN_BITS + 1], + pub sentinel_bits: [u32; MAX_HUFFMAN_BITS], /// First symbol index minus first codeword mod 2**16 for each length. - pub offset_first_sym_idx: [u16; MAX_HUFFMAN_BITS + 1], + pub offset_first_sym_idx: [u16; MAX_HUFFMAN_BITS], /// Map from symbol index to symbol. pub syms: [u16; MAX_HUFFMAN_SYMBOLS], // num_syms:usize @@ -48,62 +48,65 @@ impl Default for HuffmanDecoder { /// Returns false if the codeword lengths do not correspond to a valid prefix /// code. impl HuffmanDecoder { - pub fn init(&mut self, lengths: &[u8], n: usize) -> bool { - let mut count = [0; MAX_HUFFMAN_BITS + 1]; - let mut code = [0; MAX_HUFFMAN_BITS + 1]; - let mut sym_idx = [0; MAX_HUFFMAN_BITS + 1]; + pub fn init(&mut self, lengths: &[u8], n: usize) -> std::io::Result<()> { + let mut count = [0; MAX_HUFFMAN_BITS]; + let mut code = [0; MAX_HUFFMAN_BITS]; + let mut sym_idx: [u16; 16] = [0; MAX_HUFFMAN_BITS]; // Zero-initialize the lookup table. for t in &mut self.table { t.len = 0; } // Count the number of codewords of each length. - for i in 0..n { - debug_assert!(lengths[i] as usize <= MAX_HUFFMAN_BITS); - count[lengths[i] as usize] += 1; + for sym in 0..n { + let len = lengths[sym] as usize; + // Ignore zero-length codewords. + if len == 0 { + continue; + } + debug_assert!(len < MAX_HUFFMAN_BITS); + count[len] += 1; } - count[0] = 0; // Ignore zero-length codewords. - // Compute sentinel_bits and offset_first_sym_idx for each length. 
- code[0] = 0; - sym_idx[0] = 0; - for l in 1..=MAX_HUFFMAN_BITS { + + for len in 1..MAX_HUFFMAN_BITS { // First canonical codeword of this length. - code[l] = ((code[l - 1] + count[l - 1]) << 1) as u16; + code[len] = (code[len - 1] + count[len - 1]) << 1; - if count[l] != 0 && code[l] as u32 + count[l] as u32 - 1 > (1u32 << l) - 1 { - // The last codeword is longer than l bits. - return false; + if count[len] != 0 && code[len] as u32 + count[len] as u32 - 1 > (1u32 << len) - 1 { + return Err(Error::new( + io::ErrorKind::InvalidData, + "The last codeword is longer than len bits", + )); } - let s = ((code[l] as u32 + count[l] as u32) << (MAX_HUFFMAN_BITS - l)) as u32; - self.sentinel_bits[l] = s; - debug_assert!(self.sentinel_bits[l] >= code[l] as u32, "No overflow!"); - - sym_idx[l] = sym_idx[l - 1] + count[l - 1]; - self.offset_first_sym_idx[l] = sym_idx[l].wrapping_sub(code[l]); + let s = ((code[len] as u32 + count[len] as u32) << (MAX_HUFFMAN_BITS - len)) as u32; + self.sentinel_bits[len] = s; + debug_assert!(self.sentinel_bits[len] >= code[len] as u32, "No overflow!"); + sym_idx[len] = sym_idx[len - 1] + count[len - 1]; + self.offset_first_sym_idx[len] = sym_idx[len].wrapping_sub(code[len]); } // Build mapping from index to symbol and populate the lookup table. 
- for i in 0..n { - let l = lengths[i] as usize; - if l == 0 { + for sym in 0..n { + let len = lengths[sym] as usize; + if len == 0 { continue; } - self.syms[sym_idx[l] as usize] = i as u16; - sym_idx[l] += 1; + self.syms[sym_idx[len] as usize] = sym as u16; + sym_idx[len] += 1; - if l <= HUFFMAN_LOOKUP_TABLE_BITS as usize { - self.table_insert(i, l, code[l]); - code[l] += 1; + if len < HUFFMAN_LOOKUP_TABLE_BITS as usize { + self.table_insert(sym, len, code[len]); + code[len] += 1; } } - true + Ok(()) } pub fn table_insert(&mut self, sym: usize, len: usize, codeword: u16) { - debug_assert!(len <= HUFFMAN_LOOKUP_TABLE_BITS as usize); + debug_assert!(len < HUFFMAN_LOOKUP_TABLE_BITS as usize); let codeword = reverse_lsb(codeword, len); // Make it LSB-first. let pad_len = HUFFMAN_LOOKUP_TABLE_BITS as usize - len; @@ -128,7 +131,7 @@ impl HuffmanDecoder { debug_assert!(lookup_bits < self.table.len()); if self.table[lookup_bits].len != 0 { - debug_assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); + debug_assert!(self.table[lookup_bits].len < HUFFMAN_LOOKUP_TABLE_BITS); // debug_assert!(self.table[lookup_bits].sym < self.num_syms); *num_used_bits = self.table[lookup_bits].len; return Ok(self.table[lookup_bits].sym); @@ -136,15 +139,15 @@ impl HuffmanDecoder { // Then do canonical decoding with the bits in MSB-first order. 
let mut bits = reverse_lsb(bits, MAX_HUFFMAN_BITS); - for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..=MAX_HUFFMAN_BITS { - if (bits as u32) < self.sentinel_bits[l] { + for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..MAX_HUFFMAN_BITS { + if self.sentinel_bits[l] > bits as u32 { bits >>= MAX_HUFFMAN_BITS - l; - let sym_idx = (self.offset_first_sym_idx[l] as usize + bits as usize) & 0xFFFF; - //assert(sym_idx < self.num_syms); + let sym_idx = self.offset_first_sym_idx[l] + bits; + // debug_assert(sym_idx < self.num_syms); *num_used_bits = l as u8; - return Ok(self.syms[sym_idx]); + return Ok(self.syms[sym_idx as usize]); } } *num_used_bits = 0; @@ -184,7 +187,7 @@ mod tests { ]; let mut d = HuffmanDecoder::default(); - assert!(d.init(&lens, lens.len())); + d.init(&lens, lens.len()).unwrap(); let mut used = 0; // 000 (msb-first) -> 000 (lsb-first) diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 2dd644612..bc30279dc 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -78,9 +78,7 @@ fn read_huffman_code( )); } - let ok = d.init(&lens, num_lens); - debug_assert!(ok, "The checks above mean the tree should be valid."); - Ok(()) + d.init(&lens, num_lens) } fn hwexplode( From 45f0f66d4d4ce7d63046762f289dd4df596b19ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 10:53:01 +0200 Subject: [PATCH 12/50] read_next_byte no longer uses out param. --- src/legacy/reduce.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index c3b547af3..94acf3328 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -50,26 +50,19 @@ fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> io::Resu /// Read the next byte from is, decoded based on prev_byte and the follower sets. /// The byte is returned in *out_byte. The function returns true on success, /// and false on bad data or end of input. 
-fn read_next_byte( - is: &mut BitStream, - prev_byte: u8, - fsets: &mut [FollowerSet], - out_byte: &mut u8, -) -> io::Result<()> { +fn read_next_byte(is: &mut BitStream, prev_byte: u8, fsets: &mut [FollowerSet]) -> io::Result { let bits = is.bits(); if fsets[prev_byte as usize].size == 0 { // No followers; read a literal byte. - *out_byte = bits as u8; is.advance(8)?; - return Ok(()); + return Ok(bits as u8); } if lsb(bits, 1) == 1 { // Don't use the follower set; read a literal byte. - *out_byte = (bits >> 1) as u8; is.advance(1 + 8)?; - return Ok(()); + return Ok((bits >> 1) as u8); } // The bits represent the index of a follower byte. @@ -81,9 +74,8 @@ fn read_next_byte( "Invalid follower index", )); } - *out_byte = fsets[prev_byte as usize].followers[follower_idx]; is.advance(1 + idx_bw)?; - Ok(()) + Ok(fsets[prev_byte as usize].followers[follower_idx]) } fn max_len(comp_factor: u8) -> usize { @@ -125,7 +117,7 @@ fn hwexpand( while dst.len() < uncomp_len { // Read a literal byte or DLE marker. - read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; + curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; if curr_byte != DLE_BYTE { // Output a literal byte. dst.push_back(curr_byte); @@ -133,7 +125,7 @@ fn hwexpand( } // Read the V byte which determines the length. - read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; + curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; if curr_byte == 0 { // Output a literal DLE byte. dst.push_back(DLE_BYTE); @@ -143,13 +135,13 @@ fn hwexpand( let mut len = lsb(v as u64, v_len_bits) as usize; if len == (1 << v_len_bits) - 1 { // Read an extra length byte. - read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; + curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; len += curr_byte as usize; } len += 3; // Read the W byte, which together with V gives the distance. 
- read_next_byte(&mut is, curr_byte, &mut fsets, &mut curr_byte)?; + curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; let dist = ((v as usize) >> v_len_bits) * 256 + curr_byte as usize + 1; debug_assert!(len <= max_len(comp_factor)); From 68f31428fcc1909a415ee5bc0bd175b710b6b815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 13 May 2024 11:10:03 +0200 Subject: [PATCH 13/50] Some minor cleanups. --- src/legacy/implode.rs | 2 +- src/legacy/reduce.rs | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index bc30279dc..9e2ce89c3 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -15,7 +15,7 @@ fn read_huffman_code( num_lens: usize, d: &mut HuffmanDecoder, ) -> std::io::Result<()> { - let mut lens = [0; 256]; + let mut lens = [0; 1 << 8]; let mut len_count = [0; 17]; // debug_assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 94acf3328..de43eb1e0 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -91,7 +91,7 @@ fn max_dist(comp_factor: u8) -> usize { debug_assert!(comp_factor >= 1 && comp_factor <= 4); let v_dist_bits = comp_factor as usize; // Bits in V * 256 + W byte + implicit 1. 
*/ - ((1 << v_dist_bits) - 1) * 256 + u8::MAX as usize + 1 + 1 << (v_dist_bits + 8) } const DLE_BYTE: u8 = 144; @@ -104,7 +104,7 @@ fn hwexpand( src_used: &mut usize, dst: &mut VecDeque, ) -> io::Result<()> { - let mut fsets = [FollowerSet::default(); 256]; + let mut fsets = [FollowerSet::default(); 1 << 8]; debug_assert!(comp_factor >= 1 && comp_factor <= 4); let mut is = BitStream::new(src, src_len); @@ -222,7 +222,7 @@ impl Read for ReduceDecoder { mod tests { use std::collections::VecDeque; - use crate::legacy::reduce::follower_idx_bw; + use crate::legacy::reduce::{follower_idx_bw, max_dist}; use super::hwexpand; @@ -476,4 +476,13 @@ mod tests { assert_eq!(orig_follower_idx_bw(i), follower_idx_bw(i)); } } + + #[test] + fn test_max_dist() { + for i in 1..=4 { + let v_dist_bits = i as usize; + let c = ((1 << v_dist_bits) - 1) * 256 + 255 + 1; + assert_eq!(max_dist(i), c); + } + } } From d8067eb3ba292db56bfbda7c10c09088f64559d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 14 May 2024 11:16:15 +0200 Subject: [PATCH 14/50] Added suggestions from PR review. 
--- src/compression.rs | 10 +- src/legacy/bitstream.rs | 2 +- src/legacy/implode.rs | 22 +--- src/legacy/reduce.rs | 196 ++-------------------------------- tests/implode_hamlet_256.bin | 2 + tests/reduce_hamlet_2048.bin | Bin 0 -> 1285 bytes tests/reduce_zero_reduced.bin | Bin 0 -> 1297 bytes 7 files changed, 21 insertions(+), 211 deletions(-) create mode 100644 tests/implode_hamlet_256.bin create mode 100644 tests/reduce_hamlet_2048.bin create mode 100644 tests/reduce_zero_reduced.bin diff --git a/src/compression.rs b/src/compression.rs index e32dd1851..e345eca15 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -57,11 +57,11 @@ pub enum CompressionMethod { impl CompressionMethod { pub const STORE: Self = CompressionMethod::Stored; pub const SHRINK: Self = CompressionMethod::Shrink; - pub const REDUCE_1: Self = CompressionMethod::Unsupported(2); - pub const REDUCE_2: Self = CompressionMethod::Unsupported(3); - pub const REDUCE_3: Self = CompressionMethod::Unsupported(4); - pub const REDUCE_4: Self = CompressionMethod::Unsupported(5); - pub const IMPLODE: Self = CompressionMethod::Unsupported(6); + pub const REDUCE_1: Self = CompressionMethod::Reduce(1); + pub const REDUCE_2: Self = CompressionMethod::Reduce(2); + pub const REDUCE_3: Self = CompressionMethod::Reduce(3); + pub const REDUCE_4: Self = CompressionMethod::Reduce(4); + pub const IMPLODE: Self = CompressionMethod::Implode; #[cfg(feature = "_deflate-any")] pub const DEFLATE: Self = CompressionMethod::Deflated; #[cfg(not(feature = "_deflate-any"))] diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index 9662e420f..954f9ef94 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -11,7 +11,7 @@ pub fn lsb(x: u64, n: u8) -> u64 { pub fn reverse_lsb(x: u16, n: usize) -> u16 { debug_assert!(n > 0); debug_assert!(n <= 16); - return x.reverse_bits() >> (16 - n); + x.reverse_bits() >> (16 - n) } /// Input bitstream. 
diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 9e2ce89c3..76d3d3eda 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -255,32 +255,14 @@ mod tests { use super::hwexplode; - const HAMLET_256: [u8; 249] = [ - 0x0d, 0x02, 0x01, 0x12, 0x23, 0x14, 0x15, 0x36, 0x37, 0x68, 0x89, 0x9a, 0xdb, 0x3c, 0x05, - 0x06, 0x12, 0x13, 0x44, 0xc5, 0xf6, 0x96, 0xf7, 0xdf, 0xef, 0xfe, 0xdd, 0x50, 0x21, 0x54, - 0xb9, 0x6f, 0xd5, 0x96, 0x1d, 0x4b, 0x17, 0xe4, 0xd1, 0xba, 0x74, 0xcb, 0xba, 0x15, 0x5b, - 0x56, 0xee, 0x59, 0x90, 0x45, 0x85, 0xbe, 0x7d, 0xbb, 0x16, 0xe4, 0x5b, 0xb3, 0x20, 0x91, - 0x86, 0x6d, 0xcb, 0xb6, 0x2c, 0x5d, 0x96, 0x20, 0xc5, 0xe6, 0x05, 0x79, 0x35, 0x2d, 0x5b, - 0xb6, 0x69, 0x9c, 0x37, 0xc8, 0xa9, 0x68, 0xc3, 0xae, 0x2d, 0x3b, 0x17, 0x6e, 0xd9, 0xb0, - 0x72, 0xcb, 0xe8, 0xaf, 0xe0, 0x4d, 0x15, 0x6d, 0xda, 0xb9, 0x20, 0xcb, 0xbc, 0x37, 0xe4, - 0x37, 0xfb, 0x56, 0x2e, 0x48, 0xba, 0x68, 0xcb, 0x82, 0xac, 0x3b, 0xb7, 0x8c, 0xff, 0x0c, - 0xeb, 0x36, 0xef, 0x5b, 0xb7, 0x65, 0x8c, 0xe7, 0x1d, 0xea, 0xf5, 0xbe, 0xc2, 0xb7, 0x9b, - 0xee, 0x5e, 0xd5, 0x6d, 0x9a, 0x74, 0x4d, 0x26, 0x59, 0xd3, 0x0d, 0x63, 0xbc, 0xe7, 0x74, - 0x3f, 0x19, 0x63, 0xdd, 0xf6, 0xed, 0x1c, 0xa0, 0xfb, 0x0d, 0xf7, 0xfd, 0x6f, 0x38, 0xd9, - 0x9a, 0xee, 0x9c, 0xfe, 0xa1, 0x3e, 0xef, 0x40, 0x6b, 0x36, 0xe9, 0xeb, 0x7c, 0x83, 0x74, - 0xfb, 0x16, 0xe4, 0x98, 0xf1, 0xd1, 0x7e, 0xd4, 0xcb, 0x7f, 0xa3, 0x41, 0xde, 0x6c, 0xe6, - 0xdb, 0xf5, 0xe2, 0x5f, 0xd9, 0x0a, 0x79, 0xcb, 0x4d, 0x13, 0x54, 0xa7, 0x61, 0x57, 0xf8, - 0x2b, 0x5d, 0xb5, 0xef, 0xb9, 0x6f, 0xcb, 0xda, 0x49, 0xd6, 0x2e, 0x41, 0x82, 0xcc, 0xfa, - 0xb6, 0x2e, 0xc8, 0xb6, 0x61, 0xf3, 0xe8, 0x3f, 0x1c, 0xe2, 0x9d, 0x06, 0xa9, 0x9f, 0x4d, - 0x6b, 0xc7, 0xe8, 0x19, 0xfb, 0x9d, 0xea, 0x63, 0xbb, - ]; + const HAMLET_256: &[u8; 249] = include_bytes!("../../tests/implode_hamlet_256.bin"); #[test] fn test_explode_hamlet_256() { let mut src_used = HAMLET_256.len(); let mut dst = VecDeque::new(); hwexplode( - &HAMLET_256, + 
HAMLET_256, HAMLET_256.len(), 256, false, diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index de43eb1e0..949710fd5 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -8,13 +8,13 @@ use super::bitstream::{lsb, BitStream}; /// Number of bits used to represent indices in a follower set of size n. fn follower_idx_bw(n: u8) -> u8 { debug_assert!(n <= 32); - if n == 0 { - return 0; - } - if n == 1 { - return 1; + match n { + 0 => 0, + 1 => 1, + _ => { + 5 - ((n - 1) << 3).leading_zeros() as u8 + } } - 5 - ((n - 1) << 3).leading_zeros() as u8 } #[derive(Default, Clone, Copy)] @@ -226,101 +226,14 @@ mod tests { use super::hwexpand; - const HAMLET_2048: [u8; 1285] = [ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x58, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x0f, 0x06, 0x11, - 0x31, 0x21, 0x1f, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x04, 0x99, 0x00, 0x00, 0x00, 0x00, 0x20, 0x80, 0xbc, 0x01, 0xc4, 0x5d, 0x1a, 0x5a, 0x98, - 0x50, 0x06, 0x49, 0xcc, 0xb9, 0xd1, 0x91, 0x11, 0x65, 0x20, 0x68, 0x73, 0x04, 0x08, 0x24, - 0x5d, 0x19, 0x51, 0x06, 0x02, 0x99, 0x06, 0x08, 0x6c, 0x61, 0x84, 0x9c, 0x5b, 0x1d, 0x1d, - 0x02, 0xf9, 0x76, 0x46, 0x36, 0x46, 0x57, 0x96, 0x26, 0x40, 0x86, 0x11, 0x65, 0x61, 0x90, - 0x6c, 0x00, 0x40, 0xb8, 0xd1, 0xcd, 0xd5, 0x09, 0x61, 0x65, 0x02, 0x64, 0x9d, 0xf0, 0x06, - 0x42, 0x40, 0xca, 0xb9, 0x81, 0x10, 0x20, 0x90, 0x69, 0x65, 0x04, 0x24, 0xdd, 0x1b, 0x9a, - 0x50, 0xa6, 0x4e, 0xc8, 0xd1, 0xb9, 0xcd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, - 0x00, 0xe9, 0x22, 
0x50, 0x11, 0x11, 0x20, 0x68, 0x52, 0x49, 0x80, 0x40, 0x15, 0x04, 0x00, - 0x80, 0xf0, 0x26, 0x04, 0x08, 0x61, 0x41, 0x02, 0x24, 0x08, 0x00, 0x08, 0x4f, 0x45, 0x00, - 0x20, 0x48, 0x39, 0x09, 0x61, 0x45, 0x02, 0x1a, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, - 0x00, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x02, 0xa4, 0x1b, 0x00, 0x00, 0x80, 0x00, 0xd2, - 0x00, 0x08, 0x20, 0x90, 0x80, 0xa0, 0x22, 0x0e, 0x00, 0x01, 0x24, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x77, 0x61, 0x53, 0x6f, 0x50, 0x45, 0x90, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x28, 0x00, 0x80, 0x00, 0x09, 0x05, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0xbb, 0xbf, 0x0d, 0x28, 0xf7, 0xad, 0x5a, 0xd9, - 0x31, 0xe9, 0x51, 0x1d, 0xc1, 0x62, 0xe8, 0x59, 0x10, 0x2d, 0xf4, 0xf6, 0xed, 0x1a, 0x88, - 0x35, 0x33, 0xd2, 0xb0, 0x6d, 0xd9, 0x90, 0x2e, 0x0b, 0xc5, 0xe6, 0xf1, 0x2a, 0x2d, 0x9b, - 0xa7, 0x0d, 0xdb, 0x16, 0x84, 0xd0, 0xb8, 0x56, 0x76, 0x2e, 0xdc, 0xb2, 0x61, 0xc0, 0x06, - 0x36, 0x90, 0x4a, 0xd3, 0x88, 0x65, 0xf0, 0x97, 0x34, 0xa2, 0x19, 0x50, 0x3a, 0xea, 0x75, - 0x30, 0xc0, 0x27, 0x8c, 0xf3, 0x14, 0x03, 0x0c, 0xee, 0xa8, 0xe0, 0x69, 0x00, 0xef, 0xa8, - 0xea, 0xe6, 0x42, 0x32, 0x10, 0xdd, 0x30, 0xe1, 0x1c, 0x84, 0xb6, 0x81, 0x6d, 0xdf, 0xce, - 0x51, 0x66, 0x2a, 0xb9, 0x48, 0x67, 0x01, 0x1f, 0x24, 0x20, 0xbd, 0xfb, 0x86, 0x6c, 0xc9, - 0x20, 0x52, 0x37, 0x09, 0x72, 0x0c, 0x30, 0x12, 0x46, 0x03, 0x48, 0x0c, 0x22, 0xd9, 0xe8, - 0x33, 0xca, 0x06, 0xca, 0xe1, 0x1c, 0xcb, 0xf9, 0x98, 0xa6, 0x7d, 0xd3, 0x39, 0x00, 0x91, - 0xbf, 0x2d, 0x6b, 0x87, 0xba, 0x10, 0x64, 0xd6, 0x1b, 0x83, 0x6c, 0x73, 0x1e, 0xc7, 0x18, - 0x6e, 0x1e, 0xd3, 0x94, 0x85, 0x67, 0xd3, 0xda, 0xe1, 0x69, 0x92, 0xbc, 0xf3, 0x3c, 0x0c, - 0x2a, 0x87, 0x2d, 0x90, 0xb0, 0x9a, 0xa6, 0x0d, 0xac, 0x93, 0x19, 0x07, 0x7a, 0xe9, 0xa0, - 0x6d, 0x50, 0x20, 0x24, 0x03, 0x74, 0x30, 0x4d, 0x3b, 0xb6, 0x8c, 0x00, 0x34, 0x6e, 0x98, - 0x6d, 0x9d, 0x8d, 0x04, 0x8f, 0x74, 0x9c, 0xc6, 0x0d, 0x70, 0x22, 0xe1, 0x0d, 0x32, 
0x65, - 0x9b, 0x16, 0x12, 0xf4, 0xe9, 0x04, 0x40, 0x97, 0x67, 0xac, 0xd0, 0x72, 0xf9, 0x86, 0x67, - 0x5d, 0x08, 0x32, 0xc9, 0xcc, 0x79, 0x32, 0x88, 0x00, 0xee, 0x26, 0x56, 0xb6, 0x6f, 0xc7, - 0x86, 0x85, 0xb4, 0x08, 0xc8, 0x13, 0x1f, 0x0d, 0x50, 0x03, 0x24, 0x8b, 0xa0, 0x22, 0xb0, - 0x39, 0x48, 0x34, 0xda, 0xe1, 0x74, 0xdf, 0x82, 0x1c, 0xb3, 0xc7, 0xae, 0x41, 0x96, 0x40, - 0xcb, 0xa6, 0x77, 0x21, 0x5b, 0xac, 0x8c, 0x91, 0xd2, 0x72, 0xf3, 0xe0, 0x13, 0x6b, 0x79, - 0x72, 0x03, 0x00, 0x18, 0xe4, 0x02, 0x2e, 0x31, 0x9a, 0x01, 0x9a, 0x66, 0x1a, 0x08, 0x6f, - 0x05, 0x59, 0x56, 0xec, 0xdb, 0xb7, 0x6b, 0x2e, 0x21, 0xad, 0x18, 0xb2, 0x44, 0x72, 0x9a, - 0xb2, 0xa1, 0x8e, 0x29, 0xe4, 0x21, 0x4d, 0x3b, 0xa8, 0x8e, 0xfc, 0x86, 0x3a, 0xb2, 0x41, - 0xbe, 0xd4, 0xb2, 0x6c, 0x18, 0x66, 0x3b, 0x11, 0x42, 0x1d, 0x3a, 0xd1, 0x8e, 0x6d, 0xc5, - 0x90, 0xc6, 0xe4, 0xe4, 0xe0, 0x80, 0xdc, 0x82, 0x3c, 0x12, 0x34, 0x12, 0x53, 0x23, 0x43, - 0xd3, 0xd5, 0x40, 0x26, 0x4c, 0xad, 0x0a, 0x97, 0x4c, 0x40, 0xae, 0x03, 0x95, 0x85, 0x4b, - 0x17, 0xf2, 0xc0, 0xca, 0x4c, 0x18, 0x16, 0xca, 0xc0, 0xc4, 0xe4, 0x40, 0x2a, 0x52, 0x26, - 0x48, 0x0e, 0x7b, 0xb6, 0xac, 0x0e, 0xda, 0x8d, 0xb2, 0x4d, 0x63, 0xb4, 0x90, 0xda, 0x35, - 0x04, 0x18, 0x76, 0x4c, 0x90, 0xce, 0x39, 0x9d, 0x96, 0x11, 0x99, 0x8c, 0xa0, 0x3a, 0xac, - 0xa2, 0x51, 0x0b, 0x0e, 0xa4, 0xfa, 0xa9, 0x40, 0x10, 0xa2, 0x1a, 0x24, 0x05, 0x3e, 0x19, - 0x81, 0xa4, 0x8a, 0x34, 0x69, 0x0a, 0x04, 0xa5, 0x3e, 0x29, 0x15, 0x1d, 0x12, 0x8f, 0xaa, - 0x58, 0xa4, 0x45, 0x3c, 0x02, 0xd1, 0x42, 0x4f, 0x4f, 0x4b, 0x46, 0x1a, 0xd4, 0xc4, 0xb4, - 0x28, 0x15, 0xaa, 0x40, 0x48, 0x82, 0x87, 0x2c, 0xa2, 0x4b, 0x87, 0x78, 0x74, 0x02, 0x1b, - 0x5e, 0x0e, 0xe1, 0x04, 0x0d, 0x25, 0x8f, 0x44, 0xd3, 0x86, 0xb1, 0x1b, 0xbb, 0x50, 0xd9, - 0x30, 0x42, 0x8a, 0x0f, 0xaa, 0x48, 0x06, 0x49, 0x45, 0x8f, 0x8a, 0x12, 0xcd, 0x82, 0x04, - 0x35, 0xc8, 0x03, 0x4d, 0x2c, 0xa0, 0xd4, 0x24, 0xa7, 0x43, 0x8b, 0x42, 0x02, 0x1f, 0x91, - 0x6e, 0x0a, 0x92, 0xba, 0xc4, 0x8a, 0xa6, 0x06, 0xf8, 0x83, 
0x30, 0xc3, 0x83, 0x91, 0xa1, - 0x6f, 0x52, 0x50, 0xad, 0x12, 0x6e, 0x87, 0xc4, 0xa4, 0x06, 0x4e, 0x8d, 0x2d, 0x23, 0x7b, - 0x92, 0x0b, 0x9a, 0xed, 0xdc, 0x34, 0x08, 0xd0, 0x85, 0x41, 0x20, 0x8e, 0xd4, 0x0c, 0x6c, - 0x63, 0x05, 0x31, 0x24, 0x8e, 0x1d, 0x1a, 0x66, 0x66, 0x43, 0x97, 0x90, 0x14, 0x03, 0x99, - 0x41, 0x46, 0xee, 0xdb, 0xb7, 0x6d, 0xa0, 0xf0, 0x9c, 0xb0, 0x0c, 0x6b, 0xf2, 0x42, 0x1e, - 0x98, 0xe1, 0x81, 0x4c, 0x12, 0x24, 0xa5, 0xa4, 0x21, 0x08, 0xbe, 0x65, 0xfb, 0x26, 0x37, - 0x8a, 0xc3, 0x1c, 0xa2, 0x7d, 0x23, 0x14, 0x81, 0xcb, 0x4a, 0x52, 0x49, 0xd0, 0x21, 0x24, - 0xd5, 0xb5, 0x02, 0x3a, 0xdb, 0xd0, 0x2b, 0x39, 0x6c, 0xfb, 0x66, 0xa0, 0x4c, 0x2f, 0xe4, - 0x1a, 0x5e, 0x48, 0x0a, 0x85, 0x4c, 0xc0, 0x0d, 0x39, 0xa1, 0x1b, 0x52, 0x28, 0xec, 0xac, - 0xf0, 0x13, 0x52, 0x06, 0xa4, 0x42, 0x0a, 0xc1, 0x14, 0x24, 0x17, 0x7c, 0x04, 0x81, 0x44, - 0x23, 0x9b, 0x29, 0x07, 0x20, 0x2c, 0x0f, 0x42, 0x90, 0xd0, 0xee, 0x06, 0x87, 0x96, 0x42, - 0x8a, 0x42, 0x4a, 0x2b, 0x64, 0x63, 0x12, 0x52, 0x14, 0x84, 0x9c, 0x71, 0x0a, 0x29, 0x11, - 0x27, 0x94, 0x68, 0x84, 0x43, 0xd3, 0x00, 0xa3, 0xd4, 0x88, 0x96, 0x71, 0x9b, 0x20, 0x82, - 0x43, 0xb6, 0x58, 0x85, 0xec, 0x02, 0x33, 0xc1, 0x8a, 0x15, 0x42, 0x71, 0x69, 0x85, 0x3c, - 0xfc, 0x42, 0x1e, 0xa9, 0x86, 0xbc, 0xf1, 0x30, 0xe6, 0x75, 0xe5, 0x8e, 0x79, 0xde, 0x30, - 0x24, 0x13, 0x4b, 0x6c, 0x42, 0x0e, 0x3b, 0x96, 0xa8, 0xdc, 0xb0, 0x6d, 0x6a, 0x1a, 0x81, - 0x65, 0x3a, 0xf7, 0x4d, 0x87, 0x4d, 0x21, 0x87, 0xc5, 0x83, 0x6c, 0x13, 0x28, 0x67, 0x20, - 0x8a, 0x6d, 0xe3, 0xc1, 0xfb, 0x50, 0x26, 0xab, 0x9c, 0x54, 0x75, 0x8a, 0x85, 0x4b, 0x0c, - 0x62, 0x87, 0x7c, 0xb0, 0xc1, 0x62, 0xb2, 0xd1, 0x90, 0x45, 0xc4, 0x15, 0xa2, 0xcc, 0x0f, - 0xa4, 0x62, 0x1f, 0x21, 0x31, 0x45, 0x15, 0x72, 0x59, 0xba, 0x6c, 0xc4, 0x98, 0xb5, 0x34, - 0x10, 0x15, 0xba, 0x34, 0x1b, 0x16, 0x72, 0x58, 0x4f, 0x17, 0x79, 0x54, 0x04, 0x5c, 0xa5, - 0x59, 0x2c, 0x66, 0x54, 0xdd, 0xb2, 0x65, 0x84, 0x0a, 0xaf, 0xda, 0x28, 0xf6, 0x98, 0x85, - 0x6e, 0xf2, 0x2e, 0x08, 0xa8, 0x59, 
0xc8, 0x72, 0x13, 0x86, 0xb2, 0x69, 0x9d, 0x69, 0x74, - 0x11, 0x9f, 0x98, 0x3e, 0x39, 0x85, 0x74, 0x4e, 0xa6, 0x6f, 0x48, 0x86, 0x43, 0x10, 0x72, - 0xd4, 0x0d, 0xa4, 0xd1, 0xba, 0x48, 0x26, 0x8b, 0x60, 0xd1, 0x29, 0x16, 0xe8, 0x4d, 0x30, - 0x2a, 0x1d, 0x72, 0xcd, 0xa4, 0x8b, 0x7c, 0x82, 0x42, 0x32, 0xd3, 0xa4, 0x20, 0x16, 0x12, - 0xb1, 0xee, 0x59, 0xb4, 0x90, 0xa3, 0x26, 0x20, 0x2f, 0x7c, 0x20, 0x21, 0x25, 0x95, 0x9f, - 0x58, 0x68, 0x24, 0xe7, 0x65, 0x34, 0x0d, 0x7b, 0xc2, 0xb9, 0xbe, 0x2e, 0xd2, 0xe8, 0x49, - 0x0a, 0x3b, 0x29, 0xe5, 0x14, 0xe4, 0x0c, 0x18, 0x27, 0x00, - ]; + const HAMLET_2048: &[u8; 1285] = include_bytes!("../../tests/reduce_hamlet_2048.bin"); #[test] fn test_expand_hamlet2048() { let mut dst = VecDeque::new(); let mut src_used = 0; hwexpand( - &HAMLET_2048, + HAMLET_2048, HAMLET_2048.len(), 2048, 4, @@ -342,102 +255,15 @@ mod tests { $ dosbox -c "mount c ." -c "c:" -c "pkzip -ea4 a.zip a" -c "exit" $ xxd -i -s 31 -l $(expr $(find A.ZIP -printf %s) - 100) A.ZIP */ - const ZEROS_REDUCED: [u8; 1297] = [ - 0xc2, 0x3f, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x58, 0x07, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x0f, - 0x06, 0x11, 0x31, 0x21, 0x1f, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x04, 0x99, 0x00, 0x00, 0x00, 0x00, 0x20, 0x80, 0xbc, 0x01, 0xc4, 0x5d, 0x1a, - 0x5a, 0x98, 0x50, 0x06, 0x49, 0xcc, 0xb9, 0xd1, 0x91, 0x11, 0x65, 0x20, 0x68, 0x73, 0x04, - 0x08, 0x24, 0x5d, 0x19, 0x51, 0x06, 0x02, 0x99, 0x06, 0x08, 0x6c, 0x61, 0x84, 0x9c, 0x5b, - 0x1d, 0x1d, 0x02, 0xf9, 0x76, 0x46, 0x36, 
0x46, 0x57, 0x96, 0x26, 0x40, 0x86, 0x11, 0x65, - 0x61, 0x90, 0x6c, 0x00, 0x40, 0xb8, 0xd1, 0xcd, 0xd5, 0x09, 0x61, 0x65, 0x02, 0x64, 0x9d, - 0xf0, 0x06, 0x42, 0x40, 0xca, 0xb9, 0x81, 0x10, 0x20, 0x90, 0x69, 0x65, 0x04, 0x24, 0xdd, - 0x1b, 0x9a, 0x50, 0xa6, 0x4e, 0xc8, 0xd1, 0xb9, 0xcd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x80, 0x00, 0xe9, 0x22, 0x50, 0x11, 0x11, 0x20, 0x68, 0x52, 0x49, 0x80, 0x40, 0x15, - 0x04, 0x00, 0x80, 0xf0, 0x26, 0x04, 0x08, 0x61, 0x41, 0x02, 0x24, 0x08, 0x00, 0x08, 0x4f, - 0x45, 0x00, 0x20, 0x48, 0x39, 0x09, 0x61, 0x45, 0x02, 0x1a, 0x15, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x80, 0x00, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x02, 0xa4, 0x1b, 0x00, 0x00, 0x80, - 0x00, 0xd2, 0x00, 0x08, 0x20, 0x90, 0x80, 0xa0, 0x22, 0x0e, 0x00, 0x01, 0x24, 0x00, 0x00, - 0x00, 0x00, 0x20, 0x77, 0x61, 0x53, 0x6f, 0x50, 0x45, 0x90, 0x70, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x28, 0x00, 0x80, 0x00, 0x09, - 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xef, 0xbb, 0xbf, 0x0d, 0x28, 0xf7, 0xad, - 0x5a, 0xd9, 0x31, 0xe9, 0x51, 0x1d, 0xc1, 0x62, 0xe8, 0x59, 0x10, 0x2d, 0xf4, 0xf6, 0xed, - 0x1a, 0x88, 0x35, 0x33, 0xd2, 0xb0, 0x6d, 0xd9, 0x90, 0x2e, 0x0b, 0xc5, 0xe6, 0xf1, 0x2a, - 0x2d, 0x9b, 0xa7, 0x0d, 0xdb, 0x16, 0x84, 0xd0, 0xb8, 0x56, 0x76, 0x2e, 0xdc, 0xb2, 0x61, - 0xc0, 0x06, 0x36, 0x90, 0x4a, 0xd3, 0x88, 0x65, 0xf0, 0x97, 0x34, 0xa2, 0x19, 0x50, 0x3a, - 0xea, 0x75, 0x30, 0xc0, 0x27, 0x8c, 0xf3, 0x14, 0x03, 0x0c, 0xee, 0xa8, 0xe0, 0x69, 0x00, - 0xef, 0xa8, 0xea, 0xe6, 0x42, 0x32, 0x10, 0xdd, 0x30, 0xe1, 0x1c, 0x84, 0xb6, 0x81, 0x6d, - 0xdf, 0xce, 0x51, 0x66, 0x2a, 0xb9, 0x48, 0x67, 0x01, 0x1f, 0x24, 0x20, 0xbd, 0xfb, 0x86, - 0x6c, 0xc9, 0x20, 0x52, 0x37, 0x09, 0x72, 0x0c, 0x30, 0x12, 0x46, 0x03, 0x48, 0x0c, 0x22, - 0xd9, 0xe8, 0x33, 0xca, 0x06, 0xca, 0xe1, 0x1c, 0xcb, 0xf9, 0x98, 0xa6, 0x7d, 0xd3, 0x39, - 0x00, 0x91, 0xbf, 0x2d, 0x6b, 0x87, 0xba, 0x10, 0x64, 0xd6, 0x1b, 0x83, 0x6c, 0x73, 0x1e, - 0xc7, 0x18, 0x6e, 
0x1e, 0xd3, 0x94, 0x85, 0x67, 0xd3, 0xda, 0xe1, 0x69, 0x92, 0xbc, 0xf3, - 0x3c, 0x0c, 0x2a, 0x87, 0x2d, 0x90, 0xb0, 0x9a, 0xa6, 0x0d, 0xac, 0x93, 0x19, 0x07, 0x7a, - 0xe9, 0xa0, 0x6d, 0x50, 0x20, 0x24, 0x03, 0x74, 0x30, 0x4d, 0x3b, 0xb6, 0x8c, 0x00, 0x34, - 0x6e, 0x98, 0x6d, 0x9d, 0x8d, 0x04, 0x8f, 0x74, 0x9c, 0xc6, 0x0d, 0x70, 0x22, 0xe1, 0x0d, - 0x32, 0x65, 0x9b, 0x16, 0x12, 0xf4, 0xe9, 0x04, 0x40, 0x97, 0x67, 0xac, 0xd0, 0x72, 0xf9, - 0x86, 0x67, 0x5d, 0x08, 0x32, 0xc9, 0xcc, 0x79, 0x32, 0x88, 0x00, 0xee, 0x26, 0x56, 0xb6, - 0x6f, 0xc7, 0x86, 0x85, 0xb4, 0x08, 0xc8, 0x13, 0x1f, 0x0d, 0x50, 0x03, 0x24, 0x8b, 0xa0, - 0x22, 0xb0, 0x39, 0x48, 0x34, 0xda, 0xe1, 0x74, 0xdf, 0x82, 0x1c, 0xb3, 0xc7, 0xae, 0x41, - 0x96, 0x40, 0xcb, 0xa6, 0x77, 0x21, 0x5b, 0xac, 0x8c, 0x91, 0xd2, 0x72, 0xf3, 0xe0, 0x13, - 0x6b, 0x79, 0x72, 0x03, 0x00, 0x18, 0xe4, 0x02, 0x2e, 0x31, 0x9a, 0x01, 0x9a, 0x66, 0x1a, - 0x08, 0x6f, 0x05, 0x59, 0x56, 0xec, 0xdb, 0xb7, 0x6b, 0x2e, 0x21, 0xad, 0x18, 0xb2, 0x44, - 0x72, 0x9a, 0xb2, 0xa1, 0x8e, 0x29, 0xe4, 0x21, 0x4d, 0x3b, 0xa8, 0x8e, 0xfc, 0x86, 0x3a, - 0xb2, 0x41, 0xbe, 0xd4, 0xb2, 0x6c, 0x18, 0x66, 0x3b, 0x11, 0x42, 0x1d, 0x3a, 0xd1, 0x8e, - 0x6d, 0xc5, 0x90, 0xc6, 0xe4, 0xe4, 0xe0, 0x80, 0xdc, 0x82, 0x3c, 0x12, 0x34, 0x12, 0x53, - 0x23, 0x43, 0xd3, 0xd5, 0x40, 0x26, 0x4c, 0xad, 0x0a, 0x97, 0x4c, 0x40, 0xae, 0x03, 0x95, - 0x85, 0x4b, 0x17, 0xf2, 0xc0, 0xca, 0x4c, 0x18, 0x16, 0xca, 0xc0, 0xc4, 0xe4, 0x40, 0x2a, - 0x52, 0x26, 0x48, 0x0e, 0x7b, 0xb6, 0xac, 0x0e, 0xda, 0x8d, 0xb2, 0x4d, 0x63, 0xb4, 0x90, - 0xda, 0x35, 0x04, 0x18, 0x76, 0x4c, 0x90, 0xce, 0x39, 0x9d, 0x96, 0x11, 0x99, 0x8c, 0xa0, - 0x3a, 0xac, 0xa2, 0x51, 0x0b, 0x0e, 0xa4, 0xfa, 0xa9, 0x40, 0x10, 0xa2, 0x1a, 0x24, 0x05, - 0x3e, 0x19, 0x81, 0xa4, 0x8a, 0x34, 0x69, 0x0a, 0x04, 0xa5, 0x3e, 0x29, 0x15, 0x1d, 0x12, - 0x8f, 0xaa, 0x58, 0xa4, 0x45, 0x3c, 0x02, 0xd1, 0x42, 0x4f, 0x4f, 0x4b, 0x46, 0x1a, 0xd4, - 0xc4, 0xb4, 0x28, 0x15, 0xaa, 0x40, 0x48, 0x82, 0x87, 0x2c, 0xa2, 0x4b, 0x87, 0x78, 
0x74, - 0x02, 0x1b, 0x5e, 0x0e, 0xe1, 0x04, 0x0d, 0x25, 0x8f, 0x44, 0xd3, 0x86, 0xb1, 0x1b, 0xbb, - 0x50, 0xd9, 0x30, 0x42, 0x8a, 0x0f, 0xaa, 0x48, 0x06, 0x49, 0x45, 0x8f, 0x8a, 0x12, 0xcd, - 0x82, 0x04, 0x35, 0xc8, 0x03, 0x4d, 0x2c, 0xa0, 0xd4, 0x24, 0xa7, 0x43, 0x8b, 0x42, 0x02, - 0x1f, 0x91, 0x6e, 0x0a, 0x92, 0xba, 0xc4, 0x8a, 0xa6, 0x06, 0xf8, 0x83, 0x30, 0xc3, 0x83, - 0x91, 0xa1, 0x6f, 0x52, 0x50, 0xad, 0x12, 0x6e, 0x87, 0xc4, 0xa4, 0x06, 0x4e, 0x8d, 0x2d, - 0x23, 0x7b, 0x92, 0x0b, 0x9a, 0xed, 0xdc, 0x34, 0x08, 0xd0, 0x85, 0x41, 0x20, 0x8e, 0xd4, - 0x0c, 0x6c, 0x63, 0x05, 0x31, 0x24, 0x8e, 0x1d, 0x1a, 0x66, 0x66, 0x43, 0x97, 0x90, 0x14, - 0x03, 0x99, 0x41, 0x46, 0xee, 0xdb, 0xb7, 0x6d, 0xa0, 0xf0, 0x9c, 0xb0, 0x0c, 0x6b, 0xf2, - 0x42, 0x1e, 0x98, 0xe1, 0x81, 0x4c, 0x12, 0x24, 0xa5, 0xa4, 0x21, 0x08, 0xbe, 0x65, 0xfb, - 0x26, 0x37, 0x8a, 0xc3, 0x1c, 0xa2, 0x7d, 0x23, 0x14, 0x81, 0xcb, 0x4a, 0x52, 0x49, 0xd0, - 0x21, 0x24, 0xd5, 0xb5, 0x02, 0x3a, 0xdb, 0xd0, 0x2b, 0x39, 0x6c, 0xfb, 0x66, 0xa0, 0x4c, - 0x2f, 0xe4, 0x1a, 0x5e, 0x48, 0x0a, 0x85, 0x4c, 0xc0, 0x0d, 0x39, 0xa1, 0x1b, 0x52, 0x28, - 0xec, 0xac, 0xf0, 0x13, 0x52, 0x06, 0xa4, 0x42, 0x0a, 0xc1, 0x14, 0x24, 0x17, 0x7c, 0x04, - 0x81, 0x44, 0x23, 0x9b, 0x29, 0x07, 0x20, 0x2c, 0x0f, 0x42, 0x90, 0xd0, 0xee, 0x06, 0x87, - 0x96, 0x42, 0x8a, 0x42, 0x4a, 0x2b, 0x64, 0x63, 0x12, 0x52, 0x14, 0x84, 0x9c, 0x71, 0x0a, - 0x29, 0x11, 0x27, 0x94, 0x68, 0x84, 0x43, 0xd3, 0x00, 0xa3, 0xd4, 0x88, 0x96, 0x71, 0x9b, - 0x20, 0x82, 0x43, 0xb6, 0x58, 0x85, 0xec, 0x02, 0x33, 0xc1, 0x8a, 0x15, 0x42, 0x71, 0x69, - 0x85, 0x3c, 0xfc, 0x42, 0x1e, 0xa9, 0x86, 0xbc, 0xf1, 0x30, 0xe6, 0x75, 0xe5, 0x8e, 0x79, - 0xde, 0x30, 0x24, 0x13, 0x4b, 0x6c, 0x42, 0x0e, 0x3b, 0x96, 0xa8, 0xdc, 0xb0, 0x6d, 0x6a, - 0x1a, 0x81, 0x65, 0x3a, 0xf7, 0x4d, 0x87, 0x4d, 0x21, 0x87, 0xc5, 0x83, 0x6c, 0x13, 0x28, - 0x67, 0x20, 0x8a, 0x6d, 0xe3, 0xc1, 0xfb, 0x50, 0x26, 0xab, 0x9c, 0x54, 0x75, 0x8a, 0x85, - 0x4b, 0x0c, 0x62, 0x87, 0x7c, 0xb0, 0xc1, 0x62, 0xb2, 0xd1, 
0x90, 0x45, 0xc4, 0x15, 0xa2, - 0xcc, 0x0f, 0xa4, 0x62, 0x1f, 0x21, 0x31, 0x45, 0x15, 0x72, 0x59, 0xba, 0x6c, 0xc4, 0x98, - 0xb5, 0x34, 0x10, 0x15, 0xba, 0x34, 0x1b, 0x16, 0x72, 0x58, 0x4f, 0x17, 0x79, 0x54, 0x04, - 0x5c, 0xa5, 0x59, 0x2c, 0x66, 0x54, 0xdd, 0xb2, 0x65, 0x84, 0x0a, 0xaf, 0xda, 0x28, 0xf6, - 0x98, 0x85, 0x6e, 0xf2, 0x2e, 0x08, 0xa8, 0x59, 0xc8, 0x72, 0x13, 0x86, 0xb2, 0x69, 0x9d, - 0x69, 0x74, 0x11, 0x9f, 0x98, 0x3e, 0x39, 0x85, 0x74, 0x4e, 0xa6, 0x6f, 0x48, 0x86, 0x43, - 0x10, 0x72, 0xd4, 0x0d, 0xa4, 0xd1, 0xba, 0x48, 0x26, 0x8b, 0x60, 0xd1, 0x29, 0x16, 0xe8, - 0x4d, 0x30, 0x2a, 0x1d, 0x72, 0xcd, 0xa4, 0x8b, 0x7c, 0x82, 0x42, 0x32, 0xd3, 0xa4, 0x20, - 0x16, 0x12, 0xb1, 0xee, 0x59, 0xb4, 0x90, 0xa3, 0x26, 0x20, 0x2f, 0x7c, 0x20, 0x21, 0x25, - 0x95, 0x9f, 0x58, 0x68, 0x24, 0xe7, 0x65, 0x34, 0x0d, 0x7b, 0xc2, 0xb9, 0xbe, 0x2e, 0xd2, - 0xe8, 0x49, 0x0a, 0x3b, 0x29, 0xe5, 0x14, 0xe4, 0x0c, 0x18, 0x27, 0x42, 0xfe, 0x07, 0xff, - 0x83, 0xff, 0xc1, 0xff, 0x77, 0xff, 0x01, - ]; + const ZEROS_REDUCED: &[u8; 1297] = include_bytes!("../../tests/reduce_zero_reduced.bin"); #[test] fn test_expand_zeros() { + let mut dst = VecDeque::new(); let mut src_used = 0; hwexpand( - &ZEROS_REDUCED, + ZEROS_REDUCED, ZEROS_REDUCED.len(), 2048 + 1024, 4, diff --git a/tests/implode_hamlet_256.bin b/tests/implode_hamlet_256.bin new file mode 100644 index 000000000..02806fc7a --- /dev/null +++ b/tests/implode_hamlet_256.bin @@ -0,0 +1,2 @@ + #67h<DP!ToՖKѺt˺[VYE}[ m˶,] y5-[i7ȩhî-;nٰrMmڹ ˼77V.Hh˂; 6[e·^mtM&Y ct?c o8ٚ>@k6|t~Al_ +yMTaW+]oI.A.ȶa?Mkc \ No newline at end of file diff --git a/tests/reduce_hamlet_2048.bin b/tests/reduce_hamlet_2048.bin new file mode 100644 index 0000000000000000000000000000000000000000..4b6ba60df7a995f106b2ac9be750b4d1ca802ad4 GIT binary patch literal 1285 zcmZQzzycZ=BG|DA6D_2`$j>Hds3^~bq=RK9h@;T3hw(_PRMd1bH?%h_Wy=d{ATINOWXU;b7qK zcV$rUu;fg1Ws(vFt8QRm;si06mPj)&G%#FZ;82**ut15Afl&n{tWcg9oFCvip#a2z 
z0|5=7QchMV|NZX$yc*xvM%^@g87O-&=|!Y~?w4x1@9(yplL|rRoJBkaLT=0+JW4lT7@uN0^-$*Y&l$^V zFIzH9+^?J6zDpqGnsjqcvD|TqJh{tLTGKDzdYCzB&u1GRt#;iB8)hxzT{Bsdz3SzH z+yDg?<`M&6>uo&@CV4Y*=k~Jnm&`fFTcGri*C=(in9!G(EDqDt*IX$2*_Iy5VRZ6L zrBMgNJGHQF`N!K@w{V;gmgfy%R_R`#w87HDy7^%U1iY;q}@wP|6W<`YF<>lJ-}+N?G? z?z^%nMK^lpS4`meH_*?Pi^%U|8(G# zkA&E%14o`XXa%Wx@KtYH!*{E9lW+2t3AapHB+7gyoU@!eO>kz<0;@HP0=fB?{95TC zut-XU)lRZ;Nta0`7t2yRO;K5){#6l6Ty2;xI{Ev1yGdO+vPDC5m4io9yUrr-_KFfF z={UZJEWE1yE|=RjO79N1Y2ehwzsiHn)3v`#=xh^<=?P|Eods7^mOFPlG09KNZ-l-GV_37cQ9u5$Gx?pbf|m~dQZbyVoP!jqHCYN*mDE0vb! zJbi))^Grv#cel6aF8DBK15frRC%G998-0XSmM&4`*q8cS&AjWd%%WOlk;cB zRjzJjvbudi+cM{O+5#W_CsJ`9T&+F_cr6!72Wh-n^FcU>ZHW`tK@k=48kR;E<=L9- z3Of8w6E3`CYoF%S<>aNEk}MP?(lV!zOH)vNN=A$GWroF9I;IuQR%mkG7SZ~K$@pNG zs8eBPtIZ!Lxs`2uJ{mkLecD%f&p<`kJI9I7dfJLR8*;Ox8dI&l`?mWkwjXWI5!Ogo z=*oS3@OOaP>Nz2$U9H|cN$oWo4km57IKlOZ=%O?HOOoUj4P8ZxB6sB+nX%PGKy;Uh zv{+Guzj$Q`OU%+powSgO(vC>0Nr^$z`qiROAVdggOHN%zf2P literal 0 HcmV?d00001 diff --git a/tests/reduce_zero_reduced.bin b/tests/reduce_zero_reduced.bin new file mode 100644 index 0000000000000000000000000000000000000000..c0cba06f25d877687ae8c376d3b8a4b59f73954b GIT binary patch literal 1297 zcmX?Pufl)@G%!T4V-Y4=NP&@`P0&zLo(V|@%S;eQpkyxpy83AmbXLeqkD442{ zQOv@j5-S<*Y7T9JsfiPE7#wz7JbRThF_kH0?gus} zhf_Nn1r#P^rn0Eqm7W!_%4$fUx- zz~S%8px|N2ndr(SB??yEz`(=_VlXX{W?*PwxWvGrFri_A5+4I2I9wFU6NB>uTqhJj zjE0f|8bGC-tWe(j-TQepzORkCY4|cw_F&SBNCDk1-`+}fm>OT&kb8529{15_AGLI6 zFXz22)^cG-Sef3PO^FBC%qDnU?nwPG-DHtufYqx~g9GY4pGBB?-mQ3$$?$%~t7lF| z0(T7_%Cu~2%)NguFimTxM>?atio)LCZ8;|ug3LLKcnpNxm_2xuZoV)+#dhkU%;}#q zmepRiWSF>LH@kh8K*}}g=A2@=;}Ustm#4I*U%vG)bJCvAHauGGx)V0cTE@F(vLt)e z%LTas3M$Mc2ENwYdKgUdX5`N8W$7=ObBwn@=^?LC>TEHgFE3dfrl+sDQ1r7cJ(k1h z!U#LOV^gh|hE7UQfmDUN*B$gnrJw`c1qu9ev2QZ#GR!amI>ioVt>`u?<8 zZF1aqWmAqsnzf*ltkuQ7+@lkYJ$dq=;ZBo{kcm*Rvh(Gu4r)GYxu*L#tYeMj21 zz$qUIu~P?*JaNzpQuE-e-nNGCR_`X?kEV8=Mc(Zd zB}~$Bd=FW8Rr_5ow{4W(9dOgYsf&M=2b-sBf0xkNCKl5Z%)UAcuBa?`?sj65pP0ur 
zY1fgiWo$p14GuR?T$mpeuvRFq{m2qFzg}JC>Pg(Q-rh0cxX|jT(07F=Cz;hyrB7BW zEzNoQ1QF($j&AR6Z_i!uVa^7g>`zW|Gafej2&pVxqR6o?^|zXN*I}7OwaOxmr@ewa zFDR;9-O6Nj`+~M*&hNAZKKf6j;yk!oeGc$iE|d<^c(dk%a1h%PC$57cD&jRPjV{Ww zHQ5z(_?;$Pc*oX0&8f@DOFJc5C`hDbP9c}3p!$@I7U#2^+MFY- zk*?5{`}pAR0JYU~LQ1<@y?K(_Yc?EA+H`S(>k-jKXZV*S$txPViWWug$~iJ)tBHW< zE)!|7q6mNS$`F>ArI9*mA$K>Wws5V#rSWY>Yu+b4junw7iiF!XWzNkk5u886&a$<{ zZ&|)ao3lXC72YKmcX_CFCtTDNd*N%KC0lfMNq0??lhNfR3SvSV-$ibjuvkq&zeYh( sb?W?x43+1pCcM>$cJ9-=^um+NTJx#M6CMe5r+@7KoBtpDU;dvF02l||asU7T literal 0 HcmV?d00001 From d1b7efdbc2f49208813e5112fff970e05e20a57a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 14 May 2024 21:37:38 +0200 Subject: [PATCH 15/50] Update src/legacy/huffman.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/huffman.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index e94a9f098..d2d0b4d08 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -122,7 +122,7 @@ impl HuffmanDecoder { } /// Use the decoder d to decode a symbol from the LSB-first zero-padded bits. - /// Returns the decoded symbol number or -1 if no symbol could be decoded. + /// Returns the decoded symbol number or an error if no symbol could be decoded. /// *num_used_bits will be set to the number of bits used to decode the symbol, /// or zero if no symbol could be decoded. pub fn huffman_decode(&mut self, bits: u16, num_used_bits: &mut u8) -> std::io::Result { From 6f573409c99737e722c4d9538805ac133eb53f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 14 May 2024 21:43:07 +0200 Subject: [PATCH 16/50] Revert "Refactored huffman implementation a bit." This reverts commit fafdaf14846fb49026adfaf1cbe687c6ab2d897b. 
--- src/legacy/huffman.rs | 85 +++++++++++++++++++++---------------------- src/legacy/implode.rs | 4 +- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index d2d0b4d08..4f068d97b 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -23,9 +23,9 @@ pub struct HuffmanDecoder { /// Lookup table for fast decoding of short codewords. pub table: [TableEntry; 1 << HUFFMAN_LOOKUP_TABLE_BITS], /// "Sentinel bits" value for each codeword length. - pub sentinel_bits: [u32; MAX_HUFFMAN_BITS], + pub sentinel_bits: [u32; MAX_HUFFMAN_BITS + 1], /// First symbol index minus first codeword mod 2**16 for each length. - pub offset_first_sym_idx: [u16; MAX_HUFFMAN_BITS], + pub offset_first_sym_idx: [u16; MAX_HUFFMAN_BITS + 1], /// Map from symbol index to symbol. pub syms: [u16; MAX_HUFFMAN_SYMBOLS], // num_syms:usize @@ -48,65 +48,62 @@ impl Default for HuffmanDecoder { /// Returns false if the codeword lengths do not correspond to a valid prefix /// code. impl HuffmanDecoder { - pub fn init(&mut self, lengths: &[u8], n: usize) -> std::io::Result<()> { - let mut count = [0; MAX_HUFFMAN_BITS]; - let mut code = [0; MAX_HUFFMAN_BITS]; - let mut sym_idx: [u16; 16] = [0; MAX_HUFFMAN_BITS]; + pub fn init(&mut self, lengths: &[u8], n: usize) -> bool { + let mut count = [0; MAX_HUFFMAN_BITS + 1]; + let mut code = [0; MAX_HUFFMAN_BITS + 1]; + let mut sym_idx = [0; MAX_HUFFMAN_BITS + 1]; // Zero-initialize the lookup table. for t in &mut self.table { t.len = 0; } // Count the number of codewords of each length. - for sym in 0..n { - let len = lengths[sym] as usize; - // Ignore zero-length codewords. - if len == 0 { - continue; - } - debug_assert!(len < MAX_HUFFMAN_BITS); - count[len] += 1; + for i in 0..n { + debug_assert!(lengths[i] as usize <= MAX_HUFFMAN_BITS); + count[lengths[i] as usize] += 1; } - - for len in 1..MAX_HUFFMAN_BITS { + count[0] = 0; // Ignore zero-length codewords. 
+ // Compute sentinel_bits and offset_first_sym_idx for each length. + code[0] = 0; + sym_idx[0] = 0; + for l in 1..=MAX_HUFFMAN_BITS { // First canonical codeword of this length. - code[len] = (code[len - 1] + count[len - 1]) << 1; + code[l] = ((code[l - 1] + count[l - 1]) << 1) as u16; - if count[len] != 0 && code[len] as u32 + count[len] as u32 - 1 > (1u32 << len) - 1 { - return Err(Error::new( - io::ErrorKind::InvalidData, - "The last codeword is longer than len bits", - )); + if count[l] != 0 && code[l] as u32 + count[l] as u32 - 1 > (1u32 << l) - 1 { + // The last codeword is longer than l bits. + return false; } - let s = ((code[len] as u32 + count[len] as u32) << (MAX_HUFFMAN_BITS - len)) as u32; - self.sentinel_bits[len] = s; - debug_assert!(self.sentinel_bits[len] >= code[len] as u32, "No overflow!"); - sym_idx[len] = sym_idx[len - 1] + count[len - 1]; - self.offset_first_sym_idx[len] = sym_idx[len].wrapping_sub(code[len]); + let s = ((code[l] as u32 + count[l] as u32) << (MAX_HUFFMAN_BITS - l)) as u32; + self.sentinel_bits[l] = s; + debug_assert!(self.sentinel_bits[l] >= code[l] as u32, "No overflow!"); + + sym_idx[l] = sym_idx[l - 1] + count[l - 1]; + self.offset_first_sym_idx[l] = sym_idx[l].wrapping_sub(code[l]); } // Build mapping from index to symbol and populate the lookup table. 
- for sym in 0..n { - let len = lengths[sym] as usize; - if len == 0 { + for i in 0..n { + let l = lengths[i] as usize; + if l == 0 { continue; } - self.syms[sym_idx[len] as usize] = sym as u16; - sym_idx[len] += 1; + self.syms[sym_idx[l] as usize] = i as u16; + sym_idx[l] += 1; - if len < HUFFMAN_LOOKUP_TABLE_BITS as usize { - self.table_insert(sym, len, code[len]); - code[len] += 1; + if l <= HUFFMAN_LOOKUP_TABLE_BITS as usize { + self.table_insert(i, l, code[l]); + code[l] += 1; } } - Ok(()) + true } pub fn table_insert(&mut self, sym: usize, len: usize, codeword: u16) { - debug_assert!(len < HUFFMAN_LOOKUP_TABLE_BITS as usize); + debug_assert!(len <= HUFFMAN_LOOKUP_TABLE_BITS as usize); let codeword = reverse_lsb(codeword, len); // Make it LSB-first. let pad_len = HUFFMAN_LOOKUP_TABLE_BITS as usize - len; @@ -131,7 +128,7 @@ impl HuffmanDecoder { debug_assert!(lookup_bits < self.table.len()); if self.table[lookup_bits].len != 0 { - debug_assert!(self.table[lookup_bits].len < HUFFMAN_LOOKUP_TABLE_BITS); + debug_assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); // debug_assert!(self.table[lookup_bits].sym < self.num_syms); *num_used_bits = self.table[lookup_bits].len; return Ok(self.table[lookup_bits].sym); @@ -139,15 +136,15 @@ impl HuffmanDecoder { // Then do canonical decoding with the bits in MSB-first order. 
let mut bits = reverse_lsb(bits, MAX_HUFFMAN_BITS); - for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..MAX_HUFFMAN_BITS { - if self.sentinel_bits[l] > bits as u32 { + for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..=MAX_HUFFMAN_BITS { + if (bits as u32) < self.sentinel_bits[l] { bits >>= MAX_HUFFMAN_BITS - l; - let sym_idx = self.offset_first_sym_idx[l] + bits; - // debug_assert(sym_idx < self.num_syms); + let sym_idx = (self.offset_first_sym_idx[l] as usize + bits as usize) & 0xFFFF; + //assert(sym_idx < self.num_syms); *num_used_bits = l as u8; - return Ok(self.syms[sym_idx as usize]); + return Ok(self.syms[sym_idx]); } } *num_used_bits = 0; @@ -187,7 +184,7 @@ mod tests { ]; let mut d = HuffmanDecoder::default(); - d.init(&lens, lens.len()).unwrap(); + assert!(d.init(&lens, lens.len())); let mut used = 0; // 000 (msb-first) -> 000 (lsb-first) diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 76d3d3eda..54bd5d82e 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -78,7 +78,9 @@ fn read_huffman_code( )); } - d.init(&lens, num_lens) + let ok = d.init(&lens, num_lens); + debug_assert!(ok, "The checks above mean the tree should be valid."); + Ok(()) } fn hwexplode( From cdc08b97ec0e1b08e1efd0f01a72b1ba672a12de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 14 May 2024 21:45:24 +0200 Subject: [PATCH 17/50] Some refactorings in huffman code. --- src/legacy/huffman.rs | 11 +++++++---- src/legacy/implode.rs | 4 +--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 4f068d97b..d5019fc10 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -48,7 +48,7 @@ impl Default for HuffmanDecoder { /// Returns false if the codeword lengths do not correspond to a valid prefix /// code. 
impl HuffmanDecoder { - pub fn init(&mut self, lengths: &[u8], n: usize) -> bool { + pub fn init(&mut self, lengths: &[u8], n: usize) -> std::io::Result<()> { let mut count = [0; MAX_HUFFMAN_BITS + 1]; let mut code = [0; MAX_HUFFMAN_BITS + 1]; let mut sym_idx = [0; MAX_HUFFMAN_BITS + 1]; @@ -72,7 +72,10 @@ impl HuffmanDecoder { if count[l] != 0 && code[l] as u32 + count[l] as u32 - 1 > (1u32 << l) - 1 { // The last codeword is longer than l bits. - return false; + return Err(Error::new( + io::ErrorKind::InvalidData, + "The last codeword is longer than len bits", + )); } let s = ((code[l] as u32 + count[l] as u32) << (MAX_HUFFMAN_BITS - l)) as u32; @@ -99,7 +102,7 @@ impl HuffmanDecoder { } } - true + Ok(()) } pub fn table_insert(&mut self, sym: usize, len: usize, codeword: u16) { @@ -184,7 +187,7 @@ mod tests { ]; let mut d = HuffmanDecoder::default(); - assert!(d.init(&lens, lens.len())); + d.init(&lens, lens.len()).unwrap(); let mut used = 0; // 000 (msb-first) -> 000 (lsb-first) diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 54bd5d82e..76d3d3eda 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -78,9 +78,7 @@ fn read_huffman_code( )); } - let ok = d.init(&lens, num_lens); - debug_assert!(ok, "The checks above mean the tree should be valid."); - Ok(()) + d.init(&lens, num_lens) } fn hwexplode( From 9f96b074061a88ae500bb79cc9ddaebf51057777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 14 May 2024 22:01:55 +0200 Subject: [PATCH 18/50] Added PR changes. 
--- src/legacy/bitstream.rs | 8 +++++++- src/legacy/huffman.rs | 6 +++--- src/legacy/implode.rs | 6 ++---- src/legacy/reduce.rs | 8 +++----- src/legacy/shrink.rs | 12 ++++++------ 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs index 954f9ef94..78492956a 100644 --- a/src/legacy/bitstream.rs +++ b/src/legacy/bitstream.rs @@ -82,6 +82,12 @@ impl<'a> BitStream<'a> { pub fn bytes_read(&self) -> usize { (self.bitpos + 7) / 8 } + + pub fn read_next_bits(&mut self, code_size: u8) -> std::io::Result { + let b = self.bits(); + self.advance(code_size)?; + Ok(lsb(b, code_size) ) + } } pub const ISTREAM_MIN_BITS: usize = 64 - 7; @@ -134,7 +140,7 @@ mod tests { #[test] fn test_istream_case1() { - let bits = [0x45, 048]; + let bits = [0x45, 0x30]; let mut is = super::BitStream::new(&bits, 9); assert_eq!(lsb(is.bits(), 3), 0x05); is.advance(3).unwrap(); diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index d5019fc10..52c6395c5 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -195,15 +195,15 @@ mod tests { assert_eq!(used, 3); /* 011 (msb-first) -> 110 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x6, &mut used).unwrap(), 3); + assert_eq!(d.huffman_decode(0b110, &mut used).unwrap(), 0b011); assert_eq!(used, 3); /* 11110 (msb-first) -> 01111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x0f, &mut used).unwrap(), 17); + assert_eq!(d.huffman_decode(0b1111, &mut used).unwrap(), 0b10001); assert_eq!(used, 5); /* 111110 (msb-first) -> 011111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0x1f, &mut used).unwrap(), 16); + assert_eq!(d.huffman_decode(0b11111, &mut used).unwrap(), 0b10000); assert_eq!(used, 6); /* 1111111 (msb-first) -> 1111111 (lsb-first)*/ diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 76d3d3eda..ff013a9af 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -20,14 +20,12 @@ fn read_huffman_code( // debug_assert!(num_lens <= sizeof(lens) / 
sizeof(lens[0])); // Number of bytes representing the Huffman code. - let byte = lsb(is.bits(), 8); + let byte = is.read_next_bits(8)?; let num_bytes = (byte + 1) as usize; - is.advance(8)?; let mut codeword_idx = 0; for _byte_idx in 0..num_bytes { - let byte = lsb(is.bits(), 8); - is.advance(8)?; + let byte = is.read_next_bits(8)?; let codeword_len = (byte & 0xf) + 1; /* Low four bits plus one. */ let run_length = (byte >> 4) + 1; /* High four bits plus one. */ diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 949710fd5..1032f6c06 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -12,7 +12,7 @@ fn follower_idx_bw(n: u8) -> u8 { 0 => 0, 1 => 1, _ => { - 5 - ((n - 1) << 3).leading_zeros() as u8 + 8 - (n - 1).leading_zeros() as u8 } } } @@ -27,20 +27,18 @@ struct FollowerSet { /// Read the follower sets from is into fsets. Returns true on success. fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> io::Result<()> { for i in (0..=u8::MAX as usize).rev() { - let n = lsb(is.bits(), 6) as u8; + let n = is.read_next_bits(6)? as u8; if n > 32 { return Err(io::Error::new( io::ErrorKind::InvalidData, "Invalid follower set", )); } - is.advance(6)?; fsets[i].size = n; fsets[i].idx_bw = follower_idx_bw(n); for j in 0..fsets[i].size as usize { - fsets[i].followers[j] = is.bits() as u8; - is.advance(8)?; + fsets[i].followers[j] = is.read_next_bits(8)? 
as u8; } } diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 6618ed6df..dc8d83a3c 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use std::io::{self, copy, Error, Read}; -use super::bitstream::{lsb, BitStream}; +use super::bitstream::BitStream; const MIN_CODE_SIZE: u8 = 9; const MAX_CODE_SIZE: u8 = 13; @@ -127,8 +127,7 @@ fn read_code( queue: &mut CodeQueue, ) -> io::Result> { // assert(sizeof(code) * CHAR_BIT >= *code_size); - let code = lsb(is.bits(), *code_size) as u16; - is.advance(*code_size)?; + let code = is.read_next_bits(*code_size)? as u16; // Handle regular codes (the common case). if code != CONTROL_CODE as u16 { @@ -136,10 +135,11 @@ fn read_code( } // Handle control codes. - let control_code = lsb(is.bits(), *code_size); - if is.advance(*code_size).is_err() { + let control_code = if let Ok(c) = is.read_next_bits(*code_size) { + c + } else { return Ok(None); - } + }; if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE { (*code_size) += 1; return read_code(is, code_size, codetab, queue); From 45e81169593988ddc7cbc384d94a09eac74209ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 14 May 2024 22:03:17 +0200 Subject: [PATCH 19/50] Update src/legacy/huffman.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/huffman.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 52c6395c5..f0170401d 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -53,9 +53,7 @@ impl HuffmanDecoder { let mut code = [0; MAX_HUFFMAN_BITS + 1]; let mut sym_idx = [0; MAX_HUFFMAN_BITS + 1]; // Zero-initialize the lookup table. 
- for t in &mut self.table { - t.len = 0; - } + self.table.fill(TableEntry::default()); // Count the number of codewords of each length. for i in 0..n { From 9f88d254de7d82e3d507bbdc937874be2721433e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Thu, 16 May 2024 20:15:50 +0200 Subject: [PATCH 20/50] Revert "Removed legacy-zip feature." This reverts commit 4189412db89465fc0a6d0b23f4d75f5ba97e3e73. --- Cargo.toml | 2 ++ src/compression.rs | 18 ++++++++++++++++++ src/lib.rs | 1 + src/read.rs | 14 ++++++++++++++ src/write.rs | 3 +++ 5 files changed, 38 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 7492316fd..361f17eca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,6 +73,7 @@ deflate-zlib = ["flate2/zlib", "_deflate-any"] deflate-zlib-ng = ["flate2/zlib-ng", "_deflate-any"] deflate-zopfli = ["zopfli", "_deflate-any"] lzma = ["lzma-rs/stream"] +legacy-zip = [] unreserved = [] default = [ "aes-crypto", @@ -84,6 +85,7 @@ default = [ "lzma", "time", "zstd", + "legacy-zip", ] [[bench]] diff --git a/src/compression.rs b/src/compression.rs index e345eca15..3f58dd6b0 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -40,10 +40,13 @@ pub enum CompressionMethod { Lzma, /// Legacy format + #[cfg(feature = "legacy-zip")] Shrink, /// Reduce (Method 2-5) + #[cfg(feature = "legacy-zip")] Reduce(u8), /// Method 6 Implode/explode + #[cfg(feature = "legacy-zip")] Implode, /// Unsupported compression method #[cfg_attr( @@ -56,11 +59,17 @@ pub enum CompressionMethod { /// All compression methods defined for the ZIP format impl CompressionMethod { pub const STORE: Self = CompressionMethod::Stored; + #[cfg(feature = "legacy-zip")] pub const SHRINK: Self = CompressionMethod::Shrink; + #[cfg(feature = "legacy-zip")] pub const REDUCE_1: Self = CompressionMethod::Reduce(1); + #[cfg(feature = "legacy-zip")] pub const REDUCE_2: Self = CompressionMethod::Reduce(2); + #[cfg(feature = "legacy-zip")] pub const REDUCE_3: Self = CompressionMethod::Reduce(3); + 
#[cfg(feature = "legacy-zip")] pub const REDUCE_4: Self = CompressionMethod::Reduce(4); + #[cfg(feature = "legacy-zip")] pub const IMPLODE: Self = CompressionMethod::Implode; #[cfg(feature = "_deflate-any")] pub const DEFLATE: Self = CompressionMethod::Deflated; @@ -106,11 +115,17 @@ impl CompressionMethod { #[allow(deprecated)] match val { 0 => CompressionMethod::Stored, + #[cfg(feature = "legacy-zip")] 1 => CompressionMethod::Shrink, + #[cfg(feature = "legacy-zip")] 2 => CompressionMethod::Reduce(1), + #[cfg(feature = "legacy-zip")] 3 => CompressionMethod::Reduce(2), + #[cfg(feature = "legacy-zip")] 4 => CompressionMethod::Reduce(3), + #[cfg(feature = "legacy-zip")] 5 => CompressionMethod::Reduce(4), + #[cfg(feature = "legacy-zip")] 6 => CompressionMethod::Implode, #[cfg(feature = "_deflate-any")] 8 => CompressionMethod::Deflated, @@ -138,8 +153,11 @@ impl CompressionMethod { #[allow(deprecated)] match self { CompressionMethod::Stored => 0, + #[cfg(feature = "legacy-zip")] CompressionMethod::Shrink => 1, + #[cfg(feature = "legacy-zip")] CompressionMethod::Reduce(n) => 1 + n as u16, + #[cfg(feature = "legacy-zip")] CompressionMethod::Implode => 6, #[cfg(feature = "_deflate-any")] diff --git a/src/lib.rs b/src/lib.rs index b3f209e70..f693aa4cc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,6 +47,7 @@ mod types; pub mod write; mod zipcrypto; pub use extra_fields::ExtraField; +#[cfg(feature = "legacy-zip")] mod legacy; #[doc = "Unstable APIs\n\ diff --git a/src/read.rs b/src/read.rs index 84eda0152..55e12bfea 100644 --- a/src/read.rs +++ b/src/read.rs @@ -6,7 +6,9 @@ use crate::compression::CompressionMethod; use crate::cp437::FromCp437; use crate::crc32::Crc32Reader; use crate::extra_fields::{ExtendedTimestamp, ExtraField}; +#[cfg(feature = "legacy-zip")] use crate::legacy::ShrinkDecoder; +#[cfg(feature = "legacy-zip")] use crate::legacy::{ImplodeDecoder, ReduceDecoder}; use crate::read::zip_archive::Shared; use crate::result::{ZipError, ZipResult}; @@ -144,8 
+146,11 @@ pub(crate) enum ZipFileReader<'a> { NoReader, Raw(io::Take<&'a mut dyn Read>), Stored(Crc32Reader>), + #[cfg(feature = "legacy-zip")] Shrink(Crc32Reader>>), + #[cfg(feature = "legacy-zip")] Reduce(Crc32Reader>>), + #[cfg(feature = "legacy-zip")] Implode(Crc32Reader>>), #[cfg(feature = "_deflate-any")] Deflated(Crc32Reader>>), @@ -165,8 +170,11 @@ impl<'a> Read for ZipFileReader<'a> { ZipFileReader::NoReader => panic!("ZipFileReader was in an invalid state"), ZipFileReader::Raw(r) => r.read(buf), ZipFileReader::Stored(r) => r.read(buf), + #[cfg(feature = "legacy-zip")] ZipFileReader::Shrink(r) => r.read(buf), + #[cfg(feature = "legacy-zip")] ZipFileReader::Reduce(r) => r.read(buf), + #[cfg(feature = "legacy-zip")] ZipFileReader::Implode(r) => r.read(buf), #[cfg(feature = "_deflate-any")] ZipFileReader::Deflated(r) => r.read(buf), @@ -189,6 +197,7 @@ impl<'a> ZipFileReader<'a> { ZipFileReader::NoReader => panic!("ZipFileReader was in an invalid state"), ZipFileReader::Raw(r) => r, ZipFileReader::Stored(r) => r.into_inner().into_inner(), + #[cfg(feature = "legacy-zip")] ZipFileReader::Shrink(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately @@ -197,6 +206,7 @@ impl<'a> ZipFileReader<'a> { } return; } + #[cfg(feature = "legacy-zip")] ZipFileReader::Reduce(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately @@ -205,6 +215,7 @@ impl<'a> ZipFileReader<'a> { } return; } + #[cfg(feature = "legacy-zip")] ZipFileReader::Implode(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately @@ -332,6 +343,7 @@ pub(crate) fn make_reader( crc32, ae2_encrypted, ))), + #[cfg(feature = "legacy-zip")] CompressionMethod::Shrink => { let reader = ShrinkDecoder::new(reader, uncompressed_size); Ok(ZipFileReader::Shrink(Crc32Reader::new( @@ -340,6 +352,7 @@ pub(crate) fn make_reader( ae2_encrypted, ))) } + 
#[cfg(feature = "legacy-zip")] CompressionMethod::Reduce(comp_factor) => { let reader = ReduceDecoder::new(reader, uncompressed_size, comp_factor); Ok(ZipFileReader::Reduce(Crc32Reader::new( @@ -348,6 +361,7 @@ pub(crate) fn make_reader( ae2_encrypted, ))) } + #[cfg(feature = "legacy-zip")] CompressionMethod::Implode => { let reader = ImplodeDecoder::new(reader, uncompressed_size, flags); Ok(ZipFileReader::Implode(Crc32Reader::new( diff --git a/src/write.rs b/src/write.rs index ff4568489..a80bccf5b 100644 --- a/src/write.rs +++ b/src/write.rs @@ -1515,12 +1515,15 @@ impl GenericZipWriter { Ok(Box::new(|bare| Storer(bare))) } } + #[cfg(feature = "legacy-zip")] CompressionMethod::Shrink => Err(ZipError::UnsupportedArchive( "Shrink compression unsupported", )), + #[cfg(feature = "legacy-zip")] CompressionMethod::Reduce(_) => Err(ZipError::UnsupportedArchive( "Reduce compression unsupported", )), + #[cfg(feature = "legacy-zip")] CompressionMethod::Implode => Err(ZipError::UnsupportedArchive( "Implode compression unsupported", )), From 282b551cfa9ccea620e89e6404399513957dd2ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Sun, 19 May 2024 10:02:17 +0200 Subject: [PATCH 21/50] Switched to bitstream-io. 
--- Cargo.toml | 4 +- src/legacy/bitstream.rs | 151 ---------------------------------------- src/legacy/huffman.rs | 91 +++++++++++++++++------- src/legacy/implode.rs | 97 +++++++------------------- src/legacy/mod.rs | 13 +++- src/legacy/reduce.rs | 69 ++++++------------ src/legacy/shrink.rs | 40 +++-------- 7 files changed, 137 insertions(+), 328 deletions(-) delete mode 100644 src/legacy/bitstream.rs diff --git a/Cargo.toml b/Cargo.toml index 07c1aa413..0ad00ec84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ zstd = { version = "0.13.1", optional = true, default-features = false } zopfli = { version = "0.8.0", optional = true } deflate64 = { version = "0.1.8", optional = true } lzma-rs = { version = "0.3.0", default-features = false, optional = true } +bitstream-io = { version = "2.3.0", optional = true } [target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies] crossbeam-utils = "0.8.19" @@ -73,7 +74,7 @@ deflate-zlib = ["flate2/zlib", "_deflate-any"] deflate-zlib-ng = ["flate2/zlib-ng", "_deflate-any"] deflate-zopfli = ["zopfli", "_deflate-any"] lzma = ["lzma-rs/stream"] -legacy-zip = [] +legacy-zip = ["bitstream-io"] unreserved = [] default = [ "aes-crypto", @@ -85,7 +86,6 @@ default = [ "lzma", "time", "zstd", - "legacy-zip", ] [[bench]] diff --git a/src/legacy/bitstream.rs b/src/legacy/bitstream.rs deleted file mode 100644 index 78492956a..000000000 --- a/src/legacy/bitstream.rs +++ /dev/null @@ -1,151 +0,0 @@ -use std::io; - -/// Get the n least significant bits of x. -pub fn lsb(x: u64, n: u8) -> u64 { - debug_assert!(n <= 63); - x & ((1u64 << (n as u32)) - 1) -} - -/// Reverse the n least significant bits of x. -/// The (16 - n) most significant bits of the result will be zero. -pub fn reverse_lsb(x: u16, n: usize) -> u16 { - debug_assert!(n > 0); - debug_assert!(n <= 16); - x.reverse_bits() >> (16 - n) -} - -/// Input bitstream. 
-pub struct BitStream<'a> { - src: &'a [u8], /* Source bytes. */ - bitpos: usize, /* Position of the next bit to read. */ - bitpos_end: usize, /* Position of past-the-end bit. */ -} - -/// Initialize an input stream to present the n bytes from src as an LSB-first -/// bitstream. -impl<'a> BitStream<'a> { - pub fn new(src: &'a [u8], n: usize) -> Self { - Self { - src, - bitpos: 0, - bitpos_end: n * 8, - } - } - - /// Get the next bits from the input stream. The number of bits returned is - /// between ISTREAM_MIN_BITS and 64, depending on the position in the stream, or - /// fewer if the end of stream is reached. The upper bits are zero-padded. - pub fn bits(&mut self) -> u64 { - let next = self.bitpos / 8; - debug_assert!(next < self.src.len(), "Cannot read past end of stream."); - - let bits = if next + 8 <= self.src.len() { - // Common case: read 8 bytes in one go. - u64::from_le_bytes(self.src[next..next + 8].try_into().unwrap()) - } else { - // Read the available bytes and zero-pad. - let mut bits = 0; - for i in 0..self.src.len() - next { - bits |= (self.src[next + i] as u64) << (i as u32 * 8); - } - bits - }; - - return bits >> (self.bitpos % 8); - } - - /// Advance n bits in the bitstream if possible. Returns false if that many bits - /// are not available in the stream. - pub fn advance(&mut self, n: u8) -> std::io::Result<()> { - debug_assert!(self.bitpos <= self.bitpos_end); - - if self.bitpos_end - self.bitpos < n as usize { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "End of stream", - )); - } - - self.bitpos += n as usize; - Ok(()) - } - - /// Align the input stream to the next 8-bit boundary and return a pointer to - /// that byte, which may be the past-the-end-of-stream byte. 
- pub fn _byte_align(&mut self) -> usize { - debug_assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); - self.bitpos = 8 * (self.bitpos / 8); - debug_assert!(self.bitpos <= self.bitpos_end, "Not past end of stream."); - return self.bitpos / 8; - } - - pub fn bytes_read(&self) -> usize { - (self.bitpos + 7) / 8 - } - - pub fn read_next_bits(&mut self, code_size: u8) -> std::io::Result { - let b = self.bits(); - self.advance(code_size)?; - Ok(lsb(b, code_size) ) - } -} - -pub const ISTREAM_MIN_BITS: usize = 64 - 7; - -#[cfg(test)] -mod tests { - use crate::legacy::bitstream::{lsb, reverse_lsb}; - - #[test] - fn test_reverse16() { - assert_eq!(reverse_lsb(0x0000, 1), 0x0); - assert_eq!(reverse_lsb(0xffff, 1), 0x1); - assert_eq!(reverse_lsb(0x0000, 16), 0x0); - assert_eq!(reverse_lsb(0xffff, 16), 0xffff); - // 0001 0010 0011 0100 -> 0010 1100 0100 1000 - assert_eq!(reverse_lsb(0x1234, 16), 0x2c48); - // 111 1111 0100 0001 -> 100 0001 0111 1111 - assert_eq!(reverse_lsb(0x7f41, 15), 0x417f); - } - - #[test] - fn test_bits_test_bits_lsbround_up() { - assert_eq!(lsb(0x1122334455667788, 0), 0x0); - assert_eq!(lsb(0x1122334455667788, 5), 0x8); - assert_eq!(lsb(0x7722334455667788, 63), 0x7722334455667788); - } - - #[test] - fn test_istream_basic() { - let bits = [0x47]; - let mut is = super::BitStream::new(&bits, 1); - - assert_eq!(lsb(is.bits(), 1), 1); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 1); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 1); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 0); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 0); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 0); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 1); - is.advance(1).unwrap(); - assert_eq!(lsb(is.bits(), 1), 0); - is.advance(1).unwrap(); - } - - #[test] - fn test_istream_case1() { - let bits = [0x45, 0x30]; - let mut is = super::BitStream::new(&bits, 9); - assert_eq!(lsb(is.bits(), 3), 0x05); 
- is.advance(3).unwrap(); - - assert_eq!(lsb(is.bits(), 4), 0x08); - is.advance(4).unwrap(); - } -} diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index f0170401d..a702a0b08 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -1,8 +1,8 @@ -use std::io::{self, Error}; +use std::io::{self, Error, Seek}; -use crate::legacy::bitstream::reverse_lsb; +use bitstream_io::{BitRead, BitReader, Endianness}; -use super::bitstream::lsb; +use crate::legacy::reverse_lsb; #[derive(Default, Clone, Copy)] pub struct TableEntry { @@ -123,32 +123,41 @@ impl HuffmanDecoder { /// Returns the decoded symbol number or an error if no symbol could be decoded. /// *num_used_bits will be set to the number of bits used to decode the symbol, /// or zero if no symbol could be decoded. - pub fn huffman_decode(&mut self, bits: u16, num_used_bits: &mut u8) -> std::io::Result { + pub fn huffman_decode( + &mut self, + length: u64, + is: &mut BitReader, + ) -> std::io::Result { // First try the lookup table. - let lookup_bits = lsb(bits as u64, HUFFMAN_LOOKUP_TABLE_BITS) as usize; + let read_bits1 = (HUFFMAN_LOOKUP_TABLE_BITS as u64).min(length - is.position_in_bits()?); + let lookup_bits = !is.read::(read_bits1 as u32)? as usize; debug_assert!(lookup_bits < self.table.len()); - if self.table[lookup_bits].len != 0 { debug_assert!(self.table[lookup_bits].len <= HUFFMAN_LOOKUP_TABLE_BITS); - // debug_assert!(self.table[lookup_bits].sym < self.num_syms); - *num_used_bits = self.table[lookup_bits].len; + is.seek_bits(io::SeekFrom::Current( + -(read_bits1 as i64) + self.table[lookup_bits].len as i64, + ))?; return Ok(self.table[lookup_bits].sym); } // Then do canonical decoding with the bits in MSB-first order. - let mut bits = reverse_lsb(bits, MAX_HUFFMAN_BITS); + let read_bits2 = (HUFFMAN_LOOKUP_TABLE_BITS as u64).min(length - is.position_in_bits()?); + let mut bits = reverse_lsb( + (lookup_bits | ((!is.read::(read_bits2 as u32)? 
as usize) << read_bits1)) as u16, + MAX_HUFFMAN_BITS, + ); + for l in HUFFMAN_LOOKUP_TABLE_BITS as usize + 1..=MAX_HUFFMAN_BITS { if (bits as u32) < self.sentinel_bits[l] { bits >>= MAX_HUFFMAN_BITS - l; - let sym_idx = (self.offset_first_sym_idx[l] as usize + bits as usize) & 0xFFFF; //assert(sym_idx < self.num_syms); - - *num_used_bits = l as u8; + is.seek_bits(io::SeekFrom::Current( + -(read_bits1 as i64 + read_bits2 as i64) + l as i64, + ))?; return Ok(self.syms[sym_idx]); } } - *num_used_bits = 0; Err(Error::new( io::ErrorKind::InvalidData, "huffman decode failed", @@ -158,8 +167,11 @@ impl HuffmanDecoder { #[cfg(test)] mod tests { - use super::HuffmanDecoder; + use std::io::Cursor; + + use bitstream_io::{BitReader, LittleEndian}; + use super::HuffmanDecoder; #[test] fn test_huffman_decode_basic() { let lens = [ @@ -187,27 +199,52 @@ mod tests { let mut d = HuffmanDecoder::default(); d.init(&lens, lens.len()).unwrap(); - let mut used = 0; // 000 (msb-first) -> 000 (lsb-first) - assert_eq!(d.huffman_decode(0x0, &mut used).unwrap(), 0); - assert_eq!(used, 3); + assert_eq!( + d.huffman_decode( + 8, + &mut BitReader::endian(&mut Cursor::new(vec![!0x0]), LittleEndian) + ) + .unwrap(), + 0 + ); /* 011 (msb-first) -> 110 (lsb-first)*/ - assert_eq!(d.huffman_decode(0b110, &mut used).unwrap(), 0b011); - assert_eq!(used, 3); + assert_eq!( + d.huffman_decode( + 8, + &mut BitReader::endian(&mut Cursor::new(vec![!0b110]), LittleEndian) + ) + .unwrap(), + 0b011 + ); /* 11110 (msb-first) -> 01111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0b1111, &mut used).unwrap(), 0b10001); - assert_eq!(used, 5); + assert_eq!( + d.huffman_decode( + 8, + &mut BitReader::endian(&mut Cursor::new(vec![!0b1111]), LittleEndian) + ) + .unwrap(), + 0b10001 + ); /* 111110 (msb-first) -> 011111 (lsb-first)*/ - assert_eq!(d.huffman_decode(0b11111, &mut used).unwrap(), 0b10000); - assert_eq!(used, 6); + assert_eq!( + d.huffman_decode( + 8, + &mut BitReader::endian(&mut Cursor::new(vec![!0b11111]), 
LittleEndian) + ) + .unwrap(), + 0b10000 + ); /* 1111111 (msb-first) -> 1111111 (lsb-first)*/ - assert!(d.huffman_decode(0x7f, &mut used).is_err()); - - /* Make sure used is set even when decoding fails. */ - assert_eq!(used, 0); + assert!(d + .huffman_decode( + 8, + &mut BitReader::endian(&mut Cursor::new(vec![!0x7f]), LittleEndian) + ) + .is_err()); } } diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index ff013a9af..71fd773a0 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -1,17 +1,16 @@ use std::collections::VecDeque; -use std::io::{self, copy, Error, Read, Result}; +use std::io::{self, copy, Cursor, Error, Read, Result}; + +use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; -use crate::legacy::bitstream::{lsb, ISTREAM_MIN_BITS}; use crate::legacy::lz77::lz77_output_backref; -use super::bitstream::BitStream; use super::huffman::HuffmanDecoder; -//const COMPRESSED_BYTES_TO_BUFFER: usize = 4096; /// Initialize the Huffman decoder d with num_lens codeword lengths read from is. /// Returns false if the input is invalid. -fn read_huffman_code( - is: &mut BitStream, +fn read_huffman_code( + is: &mut BitReader, num_lens: usize, d: &mut HuffmanDecoder, ) -> std::io::Result<()> { @@ -20,12 +19,12 @@ fn read_huffman_code( // debug_assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); // Number of bytes representing the Huffman code. - let byte = is.read_next_bits(8)?; + let byte = is.read::(8)?; let num_bytes = (byte + 1) as usize; let mut codeword_idx = 0; for _byte_idx in 0..num_bytes { - let byte = is.read_next_bits(8)?; + let byte = is.read::(8)?; let codeword_len = (byte & 0xf) + 1; /* Low four bits plus one. */ let run_length = (byte >> 4) + 1; /* High four bits plus one. 
*/ @@ -81,15 +80,14 @@ fn read_huffman_code( fn hwexplode( src: &[u8], - src_len: usize, uncomp_len: usize, large_wnd: bool, lit_tree: bool, pk101_bug_compat: bool, - src_used: &mut usize, dst: &mut VecDeque, ) -> std::io::Result<()> { - let mut is = BitStream::new(src, src_len); + let bit_length = src.len() as u64 * 8; + let mut is = BitReader::endian(Cursor::new(&src), LittleEndian); let mut lit_decoder = HuffmanDecoder::default(); let mut len_decoder = HuffmanDecoder::default(); let mut dist_decoder = HuffmanDecoder::default(); @@ -111,83 +109,56 @@ fn hwexplode( 2 } }; - while dst.len() < uncomp_len { - let mut bits = is.bits(); - if lsb(bits, 1) == 0x1 { + let is_literal = is.read_bit()?; + if is_literal { // Literal. - bits >>= 1; let sym; - let mut used = 0; if lit_tree { - sym = lit_decoder.huffman_decode(!bits as u16, &mut used)?; - is.advance(1 + used)?; + sym = lit_decoder.huffman_decode(bit_length, &mut is)?; } else { - sym = lsb(bits, 8) as u16; - is.advance(1 + 8)?; + sym = is.read::(8)? as u16; } debug_assert!(sym <= u8::MAX as u16); dst.push_back(sym as u8); continue; } - // Backref. - debug_assert!(lsb(bits, 1) == 0x0); - let mut used_tot = 1; - bits >>= 1; // Read the low dist bits. let mut dist; if large_wnd { - dist = lsb(bits, 7) as usize; - bits >>= 7; - used_tot += 7; + dist = is.read::(7)?; } else { - dist = lsb(bits, 6) as usize; - bits >>= 6; - used_tot += 6; + dist = is.read::(6)?; } - // Read the Huffman-encoded high dist bits. - let mut used = 0; - let sym = dist_decoder.huffman_decode(!bits as u16, &mut used)?; - used_tot += used; - bits >>= used; - dist |= (sym as usize) << if large_wnd { 7 } else { 6 }; + let sym = dist_decoder.huffman_decode(bit_length, &mut is)?; + dist |= (sym as u16) << if large_wnd { 7 } else { 6 }; dist += 1; // Read the Huffman-encoded len. 
- let sym = len_decoder.huffman_decode(!bits as u16, &mut used)?; - used_tot += used; - bits >>= used; + let sym = len_decoder.huffman_decode(bit_length, &mut is)?; let mut len = (sym + min_len) as usize; if sym == 63 { // Read an extra len byte. - len += lsb(bits, 8) as usize; - used_tot += 8; - // bits >>= 8; + len += is.read::(8)? as usize; } - - debug_assert!((used_tot as usize) <= ISTREAM_MIN_BITS); - is.advance(used_tot)?; - // let len = len.min(uncomp_len - dst.len()); - - if len <= uncomp_len - dst.len() && dist <= dst.len() { + let len = len.min(uncomp_len - dst.len()); + if len <= uncomp_len - dst.len() && dist as usize <= dst.len() { // Enough room and no implicit zeros; chunked copy. - lz77_output_backref(dst, dist, len); + lz77_output_backref(dst, dist as usize, len); } else { // Copy, handling overlap and implicit zeros. for _i in 0..len { - if dist > dst.len() { + if dist as usize > dst.len() { dst.push_back(0); continue; } - dst.push_back(dst[dst.len() - dist]); + dst.push_back(dst[dst.len() - dist as usize]); } } } - - *src_used = is.bytes_read(); Ok(()) } @@ -229,15 +200,12 @@ impl Read for ImplodeDecoder { if let Err(err) = self.compressed_reader.read_to_end(&mut compressed_bytes) { return Err(err.into()); } - let mut src_used = 0; hwexplode( &compressed_bytes, - compressed_bytes.len(), self.uncompressed_size as usize, self.large_wnd, self.lit_tree, false, - &mut src_used, &mut self.stream, )?; } @@ -249,27 +217,14 @@ impl Read for ImplodeDecoder { #[cfg(test)] mod tests { - use std::collections::VecDeque; - use super::hwexplode; - + use std::collections::VecDeque; const HAMLET_256: &[u8; 249] = include_bytes!("../../tests/implode_hamlet_256.bin"); #[test] fn test_explode_hamlet_256() { - let mut src_used = HAMLET_256.len(); let mut dst = VecDeque::new(); - hwexplode( - HAMLET_256, - HAMLET_256.len(), - 256, - false, - false, - false, - &mut src_used, - &mut dst, - ) - .unwrap(); + hwexplode(HAMLET_256, 256, false, false, false, &mut 
dst).unwrap(); assert_eq!(dst.len(), 256); } } diff --git a/src/legacy/mod.rs b/src/legacy/mod.rs index 51bcf8ffa..f86fe77eb 100644 --- a/src/legacy/mod.rs +++ b/src/legacy/mod.rs @@ -1,4 +1,3 @@ -mod bitstream; mod huffman; mod lz77; pub mod shrink; @@ -7,3 +6,15 @@ pub mod reduce; pub use reduce::*; pub mod implode; pub use implode::*; +/// Reverse the n least significant bits of x. +/// The (16 - n) most significant bits of the result will be zero. +pub fn reverse_lsb(x: u16, n: usize) -> u16 { + debug_assert!(n > 0); + debug_assert!(n <= 16); + x.reverse_bits() >> (16 - n) +} +/// Get the n least significant bits of x. +pub fn lsb(x: u64, n: u8) -> u64 { + debug_assert!(n <= 63); + x & ((1u64 << (n as u32)) - 1) +} diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 1032f6c06..4cb1da9d9 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -1,9 +1,10 @@ use std::collections::VecDeque; use std::io::{self, copy, Read, Result}; -use crate::legacy::lz77::lz77_output_backref; +use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; -use super::bitstream::{lsb, BitStream}; +use crate::legacy::lsb; +use crate::legacy::lz77::lz77_output_backref; /// Number of bits used to represent indices in a follower set of size n. fn follower_idx_bw(n: u8) -> u8 { @@ -11,9 +12,7 @@ fn follower_idx_bw(n: u8) -> u8 { match n { 0 => 0, 1 => 1, - _ => { - 8 - (n - 1).leading_zeros() as u8 - } + _ => 8 - (n - 1).leading_zeros() as u8, } } @@ -25,9 +24,12 @@ struct FollowerSet { } /// Read the follower sets from is into fsets. Returns true on success. -fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> io::Result<()> { +fn read_follower_sets( + is: &mut BitReader, + fsets: &mut [FollowerSet], +) -> io::Result<()> { for i in (0..=u8::MAX as usize).rev() { - let n = is.read_next_bits(6)? 
as u8; + let n = is.read::(6)?; if n > 32 { return Err(io::Error::new( io::ErrorKind::InvalidData, @@ -38,7 +40,7 @@ fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> io::Resu fsets[i].idx_bw = follower_idx_bw(n); for j in 0..fsets[i].size as usize { - fsets[i].followers[j] = is.read_next_bits(8)? as u8; + fsets[i].followers[j] = is.read::(8)?; } } @@ -48,31 +50,30 @@ fn read_follower_sets(is: &mut BitStream, fsets: &mut [FollowerSet]) -> io::Resu /// Read the next byte from is, decoded based on prev_byte and the follower sets. /// The byte is returned in *out_byte. The function returns true on success, /// and false on bad data or end of input. -fn read_next_byte(is: &mut BitStream, prev_byte: u8, fsets: &mut [FollowerSet]) -> io::Result { - let bits = is.bits(); - +fn read_next_byte( + is: &mut BitReader, + prev_byte: u8, + fsets: &mut [FollowerSet], +) -> io::Result { if fsets[prev_byte as usize].size == 0 { // No followers; read a literal byte. - is.advance(8)?; - return Ok(bits as u8); + return Ok(is.read::(8)?); } - if lsb(bits, 1) == 1 { + if is.read::(1)? == 1 { // Don't use the follower set; read a literal byte. - is.advance(1 + 8)?; - return Ok((bits >> 1) as u8); + return Ok(is.read::(8)?); } // The bits represent the index of a follower byte. let idx_bw = fsets[prev_byte as usize].idx_bw; - let follower_idx = lsb(bits >> 1, idx_bw) as usize; + let follower_idx = is.read::(idx_bw as u32)? 
as usize; if follower_idx >= fsets[prev_byte as usize].size as usize { return Err(io::Error::new( io::ErrorKind::InvalidData, "Invalid follower index", )); } - is.advance(1 + idx_bw)?; Ok(fsets[prev_byte as usize].followers[follower_idx]) } @@ -96,16 +97,14 @@ const DLE_BYTE: u8 = 144; fn hwexpand( src: &[u8], - src_len: usize, uncomp_len: usize, comp_factor: u8, - src_used: &mut usize, dst: &mut VecDeque, ) -> io::Result<()> { let mut fsets = [FollowerSet::default(); 1 << 8]; debug_assert!(comp_factor >= 1 && comp_factor <= 4); - let mut is = BitStream::new(src, src_len); + let mut is = BitReader::endian(src, LittleEndian); read_follower_sets(&mut is, &mut fsets)?; // Number of bits in V used for backref length. @@ -161,8 +160,6 @@ fn hwexpand( } } - *src_used = is.bytes_read(); - Ok(()) } @@ -200,13 +197,10 @@ impl Read for ReduceDecoder { if let Err(err) = self.compressed_reader.read_to_end(&mut compressed_bytes) { return Err(err.into()); } - let mut src_used = 0; hwexpand( &compressed_bytes, - compressed_bytes.len(), self.uncompressed_size as usize, self.comp_factor, - &mut src_used, &mut self.stream, )?; } @@ -229,16 +223,7 @@ mod tests { #[test] fn test_expand_hamlet2048() { let mut dst = VecDeque::new(); - let mut src_used = 0; - hwexpand( - HAMLET_2048, - HAMLET_2048.len(), - 2048, - 4, - &mut src_used, - &mut dst, - ) - .unwrap(); + hwexpand(HAMLET_2048, 2048, 4, &mut dst).unwrap(); assert_eq!(dst.len(), 2048); } @@ -257,18 +242,8 @@ mod tests { #[test] fn test_expand_zeros() { - let mut dst = VecDeque::new(); - let mut src_used = 0; - hwexpand( - ZEROS_REDUCED, - ZEROS_REDUCED.len(), - 2048 + 1024, - 4, - &mut src_used, - &mut dst, - ) - .unwrap(); + hwexpand(ZEROS_REDUCED, 2048 + 1024, 4, &mut dst).unwrap(); assert_eq!(dst.len(), 2048 + 1024); for i in 0..(1 << 10) { assert_eq!(dst[(1 << 11) + i], 0); diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index dc8d83a3c..85f868f48 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -1,15 
+1,15 @@ use std::collections::VecDeque; use std::io::{self, copy, Error, Read}; -use super::bitstream::BitStream; +use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; const MIN_CODE_SIZE: u8 = 9; const MAX_CODE_SIZE: u8 = 13; const MAX_CODE: usize = (1 << MAX_CODE_SIZE) - 1; const CONTROL_CODE: usize = 256; -const INC_CODE_SIZE: u64 = 1; -const PARTIAL_CLEAR: u64 = 2; +const INC_CODE_SIZE: u16 = 1; +const PARTIAL_CLEAR: u16 = 2; // const HASH_BITS: usize = MAX_CODE_SIZE + 1; /* For a load factor of 0.5. */ // const HASHTAB_SIZE: usize = 1 << HASH_BITS; @@ -120,14 +120,14 @@ fn unshrink_partial_clear(codetab: &mut [Codetab], queue: &mut CodeQueue) { /// Read the next code from the input stream and return it in next_code. Returns /// false if the end of the stream is reached. If the stream contains invalid /// data, next_code is set to INVALID_CODE but the return value is still true. -fn read_code( - is: &mut BitStream, +fn read_code( + is: &mut BitReader, code_size: &mut u8, codetab: &mut [Codetab], queue: &mut CodeQueue, ) -> io::Result> { // assert(sizeof(code) * CHAR_BIT >= *code_size); - let code = is.read_next_bits(*code_size)? as u16; + let code = is.read::(*code_size as u32)?; // Handle regular codes (the common case). if code != CONTROL_CODE as u16 { @@ -135,7 +135,7 @@ fn read_code( } // Handle control codes. 
- let control_code = if let Ok(c) = is.read_next_bits(*code_size) { + let control_code = if let Ok(c) = is.read::(*code_size as u32) { c } else { return Ok(None); @@ -235,21 +235,15 @@ fn output_code( Ok(()) } -fn hwunshrink( - src: &[u8], - src_len: usize, - uncompressed_size: usize, - src_used: &mut usize, - dst: &mut VecDeque, -) -> io::Result<()> { +fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> io::Result<()> { let mut codetab = Codetab::create_new(); let mut queue = CodeQueue::new(); - let mut is = BitStream::new(src, src_len); + let mut is = BitReader::endian(src, LittleEndian); + let mut code_size = MIN_CODE_SIZE; // Handle the first code separately since there is no previous code. let Ok(Some(curr_code)) = read_code(&mut is, &mut code_size, &mut codetab, &mut queue) else { - *src_used = is.bytes_read(); return Ok(()); }; @@ -333,7 +327,6 @@ fn hwunshrink( prev_code = curr_code; } - *src_used = is.bytes_read(); Ok(()) } @@ -369,12 +362,9 @@ impl Read for ShrinkDecoder { if let Err(err) = self.compressed_reader.read_to_end(&mut compressed_bytes) { return Err(err.into()); } - let mut src_used = compressed_bytes.len(); hwunshrink( &compressed_bytes, - compressed_bytes.len(), self.uncompressed_size as usize, - &mut src_used, &mut self.stream, )?; } @@ -397,15 +387,7 @@ mod tests { #[test] fn test_unshrink_lzw_fig5() { let mut dst = VecDeque::new(); - let mut src_used = 0; - hwunshrink( - &LZW_FIG5_SHRUNK, - LZW_FIG5_SHRUNK.len(), - LZW_FIG5.len(), - &mut src_used, - &mut dst, - ) - .unwrap(); + hwunshrink(&LZW_FIG5_SHRUNK, LZW_FIG5.len(), &mut dst).unwrap(); assert_eq!(dst, LZW_FIG5); } } From 9fb99ad506e8d8fe090fa855ef30d1ab68714c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Sun, 19 May 2024 13:15:44 +0200 Subject: [PATCH 22/50] Some minor code cleanups. 
--- Cargo.toml | 3 ++- src/legacy/huffman.rs | 18 +++++++++--------- src/legacy/implode.rs | 17 +++++++---------- src/legacy/reduce.rs | 16 +++++----------- src/legacy/shrink.rs | 22 ++++------------------ 5 files changed, 27 insertions(+), 49 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0ad00ec84..747b30694 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zip" -version = "1.3.0" +version = "1.2.3" authors = [ "Mathijs van de Nes ", "Marli Frost ", @@ -86,6 +86,7 @@ default = [ "lzma", "time", "zstd", + "legacy-zip", ] [[bench]] diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index a702a0b08..186e55fe8 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -21,24 +21,24 @@ const HUFFMAN_LOOKUP_TABLE_BITS: u8 = 8; pub struct HuffmanDecoder { /// Lookup table for fast decoding of short codewords. - pub table: [TableEntry; 1 << HUFFMAN_LOOKUP_TABLE_BITS], + pub table: Vec, /// "Sentinel bits" value for each codeword length. - pub sentinel_bits: [u32; MAX_HUFFMAN_BITS + 1], + pub sentinel_bits: Vec, /// First symbol index minus first codeword mod 2**16 for each length. - pub offset_first_sym_idx: [u16; MAX_HUFFMAN_BITS + 1], + pub offset_first_sym_idx: Vec, /// Map from symbol index to symbol. - pub syms: [u16; MAX_HUFFMAN_SYMBOLS], + pub syms: Vec, // num_syms:usize } impl Default for HuffmanDecoder { fn default() -> Self { - let syms = [0; MAX_HUFFMAN_SYMBOLS]; - let table = [TableEntry::default(); 1 << HUFFMAN_LOOKUP_TABLE_BITS]; + let syms = vec![0; MAX_HUFFMAN_SYMBOLS]; + let table = vec![TableEntry::default(); 1 << HUFFMAN_LOOKUP_TABLE_BITS]; Self { table, - sentinel_bits: Default::default(), - offset_first_sym_idx: Default::default(), + sentinel_bits: vec![0; MAX_HUFFMAN_BITS + 1], + offset_first_sym_idx: vec![0; MAX_HUFFMAN_BITS + 1], syms, } } @@ -72,7 +72,7 @@ impl HuffmanDecoder { // The last codeword is longer than l bits. 
return Err(Error::new( io::ErrorKind::InvalidData, - "The last codeword is longer than len bits", + "the last codeword is longer than len bits", )); } diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 71fd773a0..57caafb7b 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -1,12 +1,9 @@ +use super::huffman::HuffmanDecoder; +use super::lz77::lz77_output_backref; +use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; use std::collections::VecDeque; use std::io::{self, copy, Cursor, Error, Read, Result}; -use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; - -use crate::legacy::lz77::lz77_output_backref; - -use super::huffman::HuffmanDecoder; - /// Initialize the Huffman decoder d with num_lens codeword lengths read from is. /// Returns false if the input is invalid. fn read_huffman_code( @@ -36,7 +33,7 @@ fn read_huffman_code( if (codeword_idx + run_length) as usize > num_lens { return Err(Error::new( io::ErrorKind::InvalidData, - "Too many codeword lengths", + "too many codeword lengths", )); } for _ in 0..run_length { @@ -50,7 +47,7 @@ fn read_huffman_code( if (codeword_idx as usize) < num_lens { return Err(Error::new( io::ErrorKind::InvalidData, - "Not enough codeword lengths", + "not enough codeword lengths", )); } @@ -63,7 +60,7 @@ fn read_huffman_code( if avail_codewords < 0 { return Err(Error::new( io::ErrorKind::InvalidData, - "Huffman tree is not full", + "huffman tree is not full", )); } } @@ -71,7 +68,7 @@ fn read_huffman_code( // Not all codewords were used. 
return Err(Error::new( io::ErrorKind::InvalidData, - "Not all codewords were used", + "not all codewords were used", )); } diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 4cb1da9d9..e8e037e16 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -33,7 +33,7 @@ fn read_follower_sets( if n > 32 { return Err(io::Error::new( io::ErrorKind::InvalidData, - "Invalid follower set", + "invalid follower set", )); } fsets[i].size = n; @@ -71,7 +71,7 @@ fn read_next_byte( if follower_idx >= fsets[prev_byte as usize].size as usize { return Err(io::Error::new( io::ErrorKind::InvalidData, - "Invalid follower index", + "invalid follower index", )); } Ok(fsets[prev_byte as usize].followers[follower_idx]) @@ -79,9 +79,7 @@ fn read_next_byte( fn max_len(comp_factor: u8) -> usize { let v_len_bits = (8 - comp_factor) as usize; - debug_assert!(comp_factor >= 1 && comp_factor <= 4); - // Bits in V + extra len byte + implicit 3. ((1 << v_len_bits) - 1) + u8::MAX as usize + 3 } @@ -93,7 +91,7 @@ fn max_dist(comp_factor: u8) -> usize { 1 << (v_dist_bits + 8) } -const DLE_BYTE: u8 = 144; +const DLE_BYTE: u8 = 0x90; fn hwexpand( src: &[u8], @@ -159,7 +157,6 @@ fn hwexpand( } } } - Ok(()) } @@ -212,12 +209,9 @@ impl Read for ReduceDecoder { #[cfg(test)] mod tests { - use std::collections::VecDeque; - - use crate::legacy::reduce::{follower_idx_bw, max_dist}; - use super::hwexpand; - + use crate::legacy::reduce::{follower_idx_bw, max_dist}; + use std::collections::VecDeque; const HAMLET_2048: &[u8; 1285] = include_bytes!("../../tests/reduce_hamlet_2048.bin"); #[test] diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 85f868f48..96d02fa2e 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -14,21 +14,7 @@ const PARTIAL_CLEAR: u16 = 2; // const HASH_BITS: usize = MAX_CODE_SIZE + 1; /* For a load factor of 0.5. 
*/ // const HASHTAB_SIZE: usize = 1 << HASH_BITS; const UNKNOWN_LEN: u16 = u16::MAX; -/* -#[derive(Error, Debug)] -enum ShrinkError { - #[error("self-referential code")] - InvalidPrefixCode, - #[error("first code needs to be literal")] - FirstCodeNeedsToBeLiteral, - - #[error("invalid code")] - InvalidCode, - - #[error("prev code no longer valid")] - PrevCodeNoLongerValid, -}*/ struct CodeQueue { next_idx: usize, codes: [Option; MAX_CODE as usize - CONTROL_CODE + 1], @@ -177,7 +163,7 @@ fn output_code( { // Reject invalid codes. Self-referential codes may exist in // the table but cannot be used. - return Err(io::Error::new(io::ErrorKind::InvalidData, "Invalid code")); + return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid code")); } if codetab[code as usize].len != UNKNOWN_LEN { @@ -213,7 +199,7 @@ fn output_code( // The prefix code is still invalid. return Err(io::Error::new( io::ErrorKind::InvalidData, - "Invalid prefix code", + "invalid prefix code", )); } @@ -265,7 +251,7 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i }; let Some(curr_code) = curr_code else { - return Err(Error::new(io::ErrorKind::InvalidData, "Invalid code")); + return Err(Error::new(io::ErrorKind::InvalidData, "invalid code")); }; let dst_pos = dst.len(); @@ -274,7 +260,7 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i if codetab[prev_code as usize].prefix_code.is_none() { return Err(Error::new( io::ErrorKind::InvalidData, - "Previous code no longer valid", + "previous code no longer valid", )); } // Extend the previous code with its first byte. 
From 561c80f7d780a78e256c45b2b081fbbc7ed62b6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 04:57:33 +0200 Subject: [PATCH 23/50] Update src/legacy/huffman.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/huffman.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 186e55fe8..4f1a70aa2 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -85,7 +85,7 @@ impl HuffmanDecoder { } // Build mapping from index to symbol and populate the lookup table. - for i in 0..n { + lengths.iter().enumerate().take(n).for_each(|(i, n) { let l = lengths[i] as usize; if l == 0 { continue; From 7d99b894777d6b4d6792cf69684d8ee1d3872452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 04:58:19 +0200 Subject: [PATCH 24/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 96d02fa2e..f16b3fa47 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -372,7 +372,7 @@ mod tests { #[test] fn test_unshrink_lzw_fig5() { - let mut dst = VecDeque::new(); + let mut dst = VecDeque::with_capacity(LZW_FIG5.len()); hwunshrink(&LZW_FIG5_SHRUNK, LZW_FIG5.len(), &mut dst).unwrap(); assert_eq!(dst, LZW_FIG5); } From 52263bad73606d7fcacdb712e525ac325c76f6b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 04:59:34 +0200 Subject: [PATCH 25/50] Update src/read.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/read.rs | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/src/read.rs b/src/read.rs index 2dfd38185..40550e0ad 100644 --- a/src/read.rs +++ b/src/read.rs @@ -198,25 +198,7 @@ impl<'a> ZipFileReader<'a> { ZipFileReader::Raw(r) => r, ZipFileReader::Stored(r) => r.into_inner().into_inner(), #[cfg(feature = "legacy-zip")] - ZipFileReader::Shrink(r) => { - // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop - // it separately - if let Ok(mut remaining) = r.into_inner().finish() { - let _ = copy(&mut remaining, &mut sink()); - } - return; - } - #[cfg(feature = "legacy-zip")] - ZipFileReader::Reduce(r) => { - // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop - // it separately - if let Ok(mut remaining) = r.into_inner().finish() { - let _ = copy(&mut remaining, &mut sink()); - } - return; - } - #[cfg(feature = "legacy-zip")] - ZipFileReader::Implode(r) => { + ZipFileReader::Shrink(r) | ZipFileReader::Reduce(r) | ZipFileReader::Implode(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately if let Ok(mut remaining) = r.into_inner().finish() { From 2a3af2ea2ce0820a35e0df127cb1aff799359bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:00:46 +0200 Subject: [PATCH 26/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index f16b3fa47..97a7c2766 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -32,7 +32,7 @@ impl CodeQueue { // Return the next code in the 
queue, or INVALID_CODE if the queue is empty. fn next(&self) -> Option { // assert(q->next_idx < sizeof(q->codes) / sizeof(q->codes[0])); - self.codes[self.next_idx] + self.codes.get(self.next_idx).flatten() } /// Return and remove the next code from the queue, or return INVALID_CODE if From d6783ad6f5eaed8b2f390d61d6d0f407fd714250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:10:00 +0200 Subject: [PATCH 27/50] Fix version. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index dd60edb19..e29108c87 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zip" -version = "1.2.3" +version = "1.3.0" authors = [ "Mathijs van de Nes ", "Marli Frost ", From f6c648bfbe9fa9e073d696fb7a5d78235874202a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:19:17 +0200 Subject: [PATCH 28/50] Fix build. --- src/legacy/huffman.rs | 6 +++--- src/legacy/shrink.rs | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 4f1a70aa2..acae2d8ee 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -85,10 +85,10 @@ impl HuffmanDecoder { } // Build mapping from index to symbol and populate the lookup table. - lengths.iter().enumerate().take(n).for_each(|(i, n) { + lengths.iter().enumerate().take(n).for_each(|(i, n)| { let l = lengths[i] as usize; if l == 0 { - continue; + return; } self.syms[sym_idx[l] as usize] = i as u16; @@ -98,7 +98,7 @@ impl HuffmanDecoder { self.table_insert(i, l, code[l]); code[l] += 1; } - } + }); Ok(()) } diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 97a7c2766..dcc5db1a8 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -32,7 +32,11 @@ impl CodeQueue { // Return the next code in the queue, or INVALID_CODE if the queue is empty. 
fn next(&self) -> Option { // assert(q->next_idx < sizeof(q->codes) / sizeof(q->codes[0])); - self.codes.get(self.next_idx).flatten() + if let Some(Some(next)) = self.codes.get(self.next_idx) { + Some(*next) + } else { + None + } } /// Return and remove the next code from the queue, or return INVALID_CODE if From cbda19239af8d8c3ae870c4a5752783e782c0426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:19:59 +0200 Subject: [PATCH 29/50] Revert "Update src/read.rs" This reverts commit 52263bad73606d7fcacdb712e525ac325c76f6b4. Doesn't work - type differs. --- src/read.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/read.rs b/src/read.rs index 40550e0ad..2dfd38185 100644 --- a/src/read.rs +++ b/src/read.rs @@ -198,7 +198,25 @@ impl<'a> ZipFileReader<'a> { ZipFileReader::Raw(r) => r, ZipFileReader::Stored(r) => r.into_inner().into_inner(), #[cfg(feature = "legacy-zip")] - ZipFileReader::Shrink(r) | ZipFileReader::Reduce(r) | ZipFileReader::Implode(r) => { + ZipFileReader::Shrink(r) => { + // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop + // it separately + if let Ok(mut remaining) = r.into_inner().finish() { + let _ = copy(&mut remaining, &mut sink()); + } + return; + } + #[cfg(feature = "legacy-zip")] + ZipFileReader::Reduce(r) => { + // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop + // it separately + if let Ok(mut remaining) = r.into_inner().finish() { + let _ = copy(&mut remaining, &mut sink()); + } + return; + } + #[cfg(feature = "legacy-zip")] + ZipFileReader::Implode(r) => { // Lzma reader owns its buffer rather than mutably borrowing it, so we have to drop // it separately if let Ok(mut remaining) = r.into_inner().finish() { From c36d4800bd95e48b769a41c0910d91bfdc691136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:29:21 +0200 Subject: [PATCH 30/50] Update 
src/legacy/reduce.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/reduce.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index e8e037e16..98a2635a7 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -137,7 +137,7 @@ fn hwexpand( // Read the W byte, which together with V gives the distance. curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; - let dist = ((v as usize) >> v_len_bits) * 256 + curr_byte as usize + 1; + let dist = ((v as usize) >> v_len_bits) << 8 + curr_byte as usize + 1; debug_assert!(len <= max_len(comp_factor)); debug_assert!(dist as usize <= max_dist(comp_factor)); From 585cc65a3ce4ade624930f2d31b665feef6356ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:32:41 +0200 Subject: [PATCH 31/50] Update src/legacy/implode.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/implode.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 57caafb7b..cf65d476d 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -106,6 +106,7 @@ fn hwexplode( 2 } }; + let dist_low_bits = if large_wnd { 7 } else { 6 }; while dst.len() < uncomp_len { let is_literal = is.read_bit()?; if is_literal { @@ -122,15 +123,10 @@ fn hwexplode( } // Read the low dist bits. - let mut dist; - if large_wnd { - dist = is.read::(7)?; - } else { - dist = is.read::(6)?; - } + let mut dist = is.read::(dist_low_bits)?; // Read the Huffman-encoded high dist bits. 
let sym = dist_decoder.huffman_decode(bit_length, &mut is)?; - dist |= (sym as u16) << if large_wnd { 7 } else { 6 }; + dist |= (sym as u16) << dist_low_bits; dist += 1; // Read the Huffman-encoded len. From d5032a0de4c2df87a7a61a72e53f232af982e5bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:33:54 +0200 Subject: [PATCH 32/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index dcc5db1a8..7736d690b 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -194,10 +194,12 @@ fn output_code( about to: the KwKwK case. Add the previous string extended with its first byte. */ debug_assert!(codetab[prev_code as usize].prefix_code.is_some()); - codetab[prefix_code as usize].prefix_code = Some(prev_code); - codetab[prefix_code as usize].ext_byte = *first_byte; - codetab[prefix_code as usize].len = codetab[prev_code as usize].len + 1; - codetab[prefix_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; + *(codetab[prefix_code as usize]) = Codetab { + prefix_code: Some(prev_code), + ext_byte: *first_byte, + len: codetab[prev_code as usize].len + 1, + last_dst_pos: codetab[prev_code as usize].last_dst_pos + }; dst.push_back(*first_byte); } else if codetab[prefix_code as usize].prefix_code.is_none() { // The prefix code is still invalid. 
From f69d2aeac98e49975cf6c33a5feeba400eb7ef14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:35:23 +0200 Subject: [PATCH 33/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 7736d690b..2595c70d6 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -271,10 +271,12 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i } // Extend the previous code with its first byte. debug_assert!(curr_code != prev_code); - codetab[curr_code as usize].prefix_code = Some(prev_code); - codetab[curr_code as usize].ext_byte = first_byte; - codetab[curr_code as usize].len = codetab[prev_code as usize].len + 1; - codetab[curr_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; + *codetab[curr_code] = Codetab { + prefix_code: Some(prev_code), + ext_byte: first_byte, + len: codetab[prev_code as usize].len + 1, + last_dst_pos: codetab[prev_code as usize].last_dst_pos, + }; // dst.push_back(first_byte); } From 91b9e16e2a98542df4956e1673e86123a1edb175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:36:20 +0200 Subject: [PATCH 34/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 2595c70d6..0f78c65ad 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -185,9 +185,11 @@ fn output_code( // was invalid (due to 
partial clearing) when the code was inserted into // the table. The prefix can then become valid when it's added to the // table at a later point. - debug_assert!(codetab[code as usize].len == UNKNOWN_LEN); - let prefix_code = codetab[code as usize].prefix_code.unwrap(); - debug_assert!(prefix_code as usize > CONTROL_CODE); + if cfg!(debug_assertions) { + let tab_entry = codetab[code as usize]; + assert!(tab_entry.len == UNKNOWN_LEN); + assert!(tab_entry.prefix_code.unwrap() as usize > CONTROL_CODE); + } if Some(prefix_code) == queue.next() { /* The prefix code hasn't been added yet, but we were just From 232fbb2eef9a50c95999980720b7fadb6bc2fe89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:39:23 +0200 Subject: [PATCH 35/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 0f78c65ad..c5f73d0c2 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -302,17 +302,21 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i let new_code = queue.remove_next(); if let Some(new_code) = new_code { //debug_assert!(codetab[prev_code as usize].last_dst_pos < dst_pos); - codetab[new_code as usize].prefix_code = Some(prev_code); - codetab[new_code as usize].ext_byte = first_byte; - codetab[new_code as usize].len = codetab[prev_code as usize].len + 1; - codetab[new_code as usize].last_dst_pos = codetab[prev_code as usize].last_dst_pos; - - if codetab[prev_code as usize].prefix_code.is_none() { - // prev_code was invalidated in a partial - // clearing. Until that code is re-used, the - // string represented by new_code is - // indeterminate. 
- codetab[new_code as usize].len = UNKNOWN_LEN; + let prev_code_entry = codetab[prev_code as usize]; + *codetab[new_code as usize] = Codetab { + prefix_code: Some(prev_code), + ext_byte: first_byte, + last_dst_pos: prev_code_entry.last_dst_pos, + len: if prev_code_entry.prefix_code.is_none() { + // prev_code was invalidated in a partial + // clearing. Until that code is re-used, the + // string represented by new_code is + // indeterminate. + UNKNOWN_LEN + } else { + prev_code_entry.len + 1; + }, + }; } // If prev_code was invalidated in a partial clearing, // it's possible that new_code==prev_code, in which From b5ea5576db6fa7ec9b07bc4f595fcc25e40dd510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 05:40:11 +0200 Subject: [PATCH 36/50] Some code cleanups. --- src/legacy/huffman.rs | 2 +- src/legacy/lz77.rs | 2 +- src/legacy/shrink.rs | 14 +++++--------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index acae2d8ee..fd9756214 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -86,7 +86,7 @@ impl HuffmanDecoder { // Build mapping from index to symbol and populate the lookup table. lengths.iter().enumerate().take(n).for_each(|(i, n)| { - let l = lengths[i] as usize; + let l = *n as usize; if l == 0 { return; } diff --git a/src/legacy/lz77.rs b/src/legacy/lz77.rs index ae3134aa3..939df9b90 100644 --- a/src/legacy/lz77.rs +++ b/src/legacy/lz77.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; /// Output the (dist,len) back reference at dst_pos in dst. 
-pub fn lz77_output_backref(dst: &mut VecDeque, dist: usize, len: usize) { +pub(crate) fn lz77_output_backref(dst: &mut VecDeque, dist: usize, len: usize) { // debug_assert!(dist <= dst_pos, "cannot reference before beginning of dst"); for _ in 0..len { diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index c5f73d0c2..bd23449a9 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -185,6 +185,7 @@ fn output_code( // was invalid (due to partial clearing) when the code was inserted into // the table. The prefix can then become valid when it's added to the // table at a later point. + let prefix_code = codetab[code as usize].prefix_code.unwrap(); if cfg!(debug_assertions) { let tab_entry = codetab[code as usize]; assert!(tab_entry.len == UNKNOWN_LEN); @@ -195,8 +196,7 @@ fn output_code( /* The prefix code hasn't been added yet, but we were just about to: the KwKwK case. Add the previous string extended with its first byte. */ - debug_assert!(codetab[prev_code as usize].prefix_code.is_some()); - *(codetab[prefix_code as usize]) = Codetab { + codetab[prefix_code as usize] = Codetab { prefix_code: Some(prev_code), ext_byte: *first_byte, len: codetab[prev_code as usize].len + 1, @@ -273,7 +273,7 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i } // Extend the previous code with its first byte. 
debug_assert!(curr_code != prev_code); - *codetab[curr_code] = Codetab { + codetab[curr_code as usize] = Codetab { prefix_code: Some(prev_code), ext_byte: first_byte, len: codetab[prev_code as usize].len + 1, @@ -303,7 +303,7 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i if let Some(new_code) = new_code { //debug_assert!(codetab[prev_code as usize].last_dst_pos < dst_pos); let prev_code_entry = codetab[prev_code as usize]; - *codetab[new_code as usize] = Codetab { + codetab[new_code as usize] = Codetab { prefix_code: Some(prev_code), ext_byte: first_byte, last_dst_pos: prev_code_entry.last_dst_pos, @@ -314,13 +314,9 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i // indeterminate. UNKNOWN_LEN } else { - prev_code_entry.len + 1; + prev_code_entry.len + 1 }, }; - } - // If prev_code was invalidated in a partial clearing, - // it's possible that new_code==prev_code, in which - // case it will never be used or cleared. } codetab[curr_code as usize].last_dst_pos = dst_pos; From b86fd937af874a75f1436f573b587790d7d50eeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 06:02:43 +0200 Subject: [PATCH 37/50] Fixed failing unit test. --- src/legacy/reduce.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 98a2635a7..5b09570a5 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -137,7 +137,7 @@ fn hwexpand( // Read the W byte, which together with V gives the distance. 
curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; - let dist = ((v as usize) >> v_len_bits) << 8 + curr_byte as usize + 1; + let dist = (((v as usize) >> v_len_bits) << 8) + curr_byte as usize + 1; debug_assert!(len <= max_len(comp_factor)); debug_assert!(dist as usize <= max_dist(comp_factor)); From 3698198dec8041b613d96b07abad90e65dd6ab91 Mon Sep 17 00:00:00 2001 From: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Date: Sun, 19 May 2024 21:04:44 -0700 Subject: [PATCH 38/50] Fix reuse of name `n` in src/legacy/huffman.rs Signed-off-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> --- src/legacy/huffman.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index fd9756214..6e227dcdd 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -85,8 +85,8 @@ impl HuffmanDecoder { } // Build mapping from index to symbol and populate the lookup table. - lengths.iter().enumerate().take(n).for_each(|(i, n)| { - let l = *n as usize; + lengths.iter().enumerate().take(n).for_each(|(i, code_len)| { + let l = *code_len as usize; if l == 0 { return; } From 9d60e97070a16568f71f5a9054f70c66eacf26cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 06:10:15 +0200 Subject: [PATCH 39/50] Added lsb tests/refactored to u8. --- src/legacy/huffman.rs | 10 ++++++++-- src/legacy/mod.rs | 12 ------------ src/legacy/reduce.rs | 24 +++++++++++++++++++----- src/legacy/shrink.rs | 2 +- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 6e227dcdd..040e1607d 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -2,8 +2,6 @@ use std::io::{self, Error, Seek}; use bitstream_io::{BitRead, BitReader, Endianness}; -use crate::legacy::reverse_lsb; - #[derive(Default, Clone, Copy)] pub struct TableEntry { /// Wide enough to fit the max symbol nbr. 
@@ -44,6 +42,14 @@ impl Default for HuffmanDecoder { } } +/// Reverse the n least significant bits of x. +/// The (16 - n) most significant bits of the result will be zero. +pub fn reverse_lsb(x: u16, n: usize) -> u16 { + debug_assert!(n > 0); + debug_assert!(n <= 16); + x.reverse_bits() >> (16 - n) +} + /// Initialize huffman decoder d for a code defined by the n codeword lengths. /// Returns false if the codeword lengths do not correspond to a valid prefix /// code. diff --git a/src/legacy/mod.rs b/src/legacy/mod.rs index f86fe77eb..15897e5ad 100644 --- a/src/legacy/mod.rs +++ b/src/legacy/mod.rs @@ -6,15 +6,3 @@ pub mod reduce; pub use reduce::*; pub mod implode; pub use implode::*; -/// Reverse the n least significant bits of x. -/// The (16 - n) most significant bits of the result will be zero. -pub fn reverse_lsb(x: u16, n: usize) -> u16 { - debug_assert!(n > 0); - debug_assert!(n <= 16); - x.reverse_bits() >> (16 - n) -} -/// Get the n least significant bits of x. -pub fn lsb(x: u64, n: u8) -> u64 { - debug_assert!(n <= 63); - x & ((1u64 << (n as u32)) - 1) -} diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 5b09570a5..171a3050e 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -1,10 +1,8 @@ use std::collections::VecDeque; use std::io::{self, copy, Read, Result}; -use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; - -use crate::legacy::lsb; use crate::legacy::lz77::lz77_output_backref; +use bitstream_io::{BitRead, BitReader, Endianness, LittleEndian}; /// Number of bits used to represent indices in a follower set of size n. fn follower_idx_bw(n: u8) -> u8 { @@ -93,6 +91,14 @@ fn max_dist(comp_factor: u8) -> usize { const DLE_BYTE: u8 = 0x90; +/// Get the n least significant bits of x. 
+fn lsb(x: u8, n: u8) -> u8 { + if n >= 8 { + return x; + } + x & ((1 << n) - 1) +} + fn hwexpand( src: &[u8], uncomp_len: usize, @@ -127,7 +133,7 @@ fn hwexpand( continue; } let v = curr_byte; - let mut len = lsb(v as u64, v_len_bits) as usize; + let mut len = lsb(v, v_len_bits) as usize; if len == (1 << v_len_bits) - 1 { // Read an extra length byte. curr_byte = read_next_byte(&mut is, curr_byte, &mut fsets)?; @@ -210,10 +216,18 @@ impl Read for ReduceDecoder { #[cfg(test)] mod tests { use super::hwexpand; - use crate::legacy::reduce::{follower_idx_bw, max_dist}; + use crate::legacy::reduce::{follower_idx_bw, lsb, max_dist}; use std::collections::VecDeque; const HAMLET_2048: &[u8; 1285] = include_bytes!("../../tests/reduce_hamlet_2048.bin"); + #[test] + fn test_lsb() { + assert_eq!(lsb(0xFF, 8), 0xFF); + for i in 0..7 { + assert_eq!(lsb(0xFF, i), (1 << i) - 1); + } + } + #[test] fn test_expand_hamlet2048() { let mut dst = VecDeque::new(); diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index bd23449a9..5a7f4643c 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -200,7 +200,7 @@ fn output_code( prefix_code: Some(prev_code), ext_byte: *first_byte, len: codetab[prev_code as usize].len + 1, - last_dst_pos: codetab[prev_code as usize].last_dst_pos + last_dst_pos: codetab[prev_code as usize].last_dst_pos, }; dst.push_back(*first_byte); } else if codetab[prefix_code as usize].prefix_code.is_none() { From 1936dd65cdd84379d760fa51d7edd988f0d152f5 Mon Sep 17 00:00:00 2001 From: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Date: Sun, 19 May 2024 21:17:59 -0700 Subject: [PATCH 40/50] style: Fix cargo-fmt complaints in huffman.rs Signed-off-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> --- src/legacy/huffman.rs | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 040e1607d..0f582b893 100644 --- a/src/legacy/huffman.rs +++ 
b/src/legacy/huffman.rs @@ -91,20 +91,24 @@ impl HuffmanDecoder { } // Build mapping from index to symbol and populate the lookup table. - lengths.iter().enumerate().take(n).for_each(|(i, code_len)| { - let l = *code_len as usize; - if l == 0 { - return; - } - - self.syms[sym_idx[l] as usize] = i as u16; - sym_idx[l] += 1; - - if l <= HUFFMAN_LOOKUP_TABLE_BITS as usize { - self.table_insert(i, l, code[l]); - code[l] += 1; - } - }); + lengths + .iter() + .enumerate() + .take(n) + .for_each(|(i, code_len)| { + let l = *code_len as usize; + if l == 0 { + return; + } + + self.syms[sym_idx[l] as usize] = i as u16; + sym_idx[l] += 1; + + if l <= HUFFMAN_LOOKUP_TABLE_BITS as usize { + self.table_insert(i, l, code[l]); + code[l] += 1; + } + }); Ok(()) } From 3c60457dc1992391b2db935d97fd95b9ff41c511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 06:18:04 +0200 Subject: [PATCH 41/50] Use follower set fixed size array. --- src/legacy/reduce.rs | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 171a3050e..dc9b14497 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -22,10 +22,12 @@ struct FollowerSet { } /// Read the follower sets from is into fsets. Returns true on success. +type FollowerSetArray = [FollowerSet; u8::MAX as usize + 1]; + fn read_follower_sets( is: &mut BitReader, - fsets: &mut [FollowerSet], -) -> io::Result<()> { +) -> io::Result { + let mut fsets = [FollowerSet::default(); u8::MAX as usize + 1]; for i in (0..=u8::MAX as usize).rev() { let n = is.read::(6)?; if n > 32 { @@ -42,7 +44,7 @@ fn read_follower_sets( } } - Ok(()) + Ok(fsets) } /// Read the next byte from is, decoded based on prev_byte and the follower sets. 
@@ -51,15 +53,10 @@ fn read_follower_sets( fn read_next_byte( is: &mut BitReader, prev_byte: u8, - fsets: &mut [FollowerSet], + fsets: &mut FollowerSetArray, ) -> io::Result { - if fsets[prev_byte as usize].size == 0 { - // No followers; read a literal byte. - return Ok(is.read::(8)?); - } - - if is.read::(1)? == 1 { - // Don't use the follower set; read a literal byte. + if fsets[prev_byte as usize].size == 0 // No followers + || is.read::(1)? == 1 {// Indicates next symbol is a literal byte return Ok(is.read::(8)?); } @@ -105,11 +102,10 @@ fn hwexpand( comp_factor: u8, dst: &mut VecDeque, ) -> io::Result<()> { - let mut fsets = [FollowerSet::default(); 1 << 8]; debug_assert!(comp_factor >= 1 && comp_factor <= 4); let mut is = BitReader::endian(src, LittleEndian); - read_follower_sets(&mut is, &mut fsets)?; + let mut fsets = read_follower_sets(&mut is)?; // Number of bits in V used for backref length. let v_len_bits = 8 - comp_factor; From 415d7de49602c79f221357c8c68a4bca42453ce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 06:19:26 +0200 Subject: [PATCH 42/50] Update src/legacy/reduce.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/reduce.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index dc9b14497..af5eb8608 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -284,7 +284,7 @@ mod tests { fn test_max_dist() { for i in 1..=4 { let v_dist_bits = i as usize; - let c = ((1 << v_dist_bits) - 1) * 256 + 255 + 1; + let c = 1 << (v_dist_bits + 8); assert_eq!(max_dist(i), c); } } From 2dbfc99a63fe16abf04887dc64331eeb48918fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 06:20:04 +0200 Subject: [PATCH 43/50] Update src/legacy/shrink.rs MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 5a7f4643c..5c08e8ea1 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -284,17 +284,14 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i // Output the string represented by the current code. let mut len = 0; - if let Err(s) = output_code( + output_code( curr_code, dst, prev_code, &mut codetab, &mut queue, &mut first_byte, - &mut len, - ) { - return Err(s); - } + &mut len)?; // Add a new code to the string table if there's room. // The string is the previous code's string extended with From 9aa2f1e416dc96f403a1ec1b0fd2a4725ea23fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Mon, 20 May 2024 06:38:39 +0200 Subject: [PATCH 44/50] Code cleanups. 
--- src/legacy/huffman.rs | 4 ++-- src/legacy/implode.rs | 24 ++++++++++++------------ src/legacy/reduce.rs | 4 +++- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/legacy/huffman.rs b/src/legacy/huffman.rs index 0f582b893..6db9fd938 100644 --- a/src/legacy/huffman.rs +++ b/src/legacy/huffman.rs @@ -100,10 +100,10 @@ impl HuffmanDecoder { if l == 0 { return; } - + self.syms[sym_idx[l] as usize] = i as u16; sym_idx[l] += 1; - + if l <= HUFFMAN_LOOKUP_TABLE_BITS as usize { self.table_insert(i, l, code[l]); code[l] += 1; diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index cf65d476d..54b43d0a9 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -9,8 +9,7 @@ use std::io::{self, copy, Cursor, Error, Read, Result}; fn read_huffman_code( is: &mut BitReader, num_lens: usize, - d: &mut HuffmanDecoder, -) -> std::io::Result<()> { +) -> std::io::Result { let mut lens = [0; 1 << 8]; let mut len_count = [0; 17]; // debug_assert!(num_lens <= sizeof(lens) / sizeof(lens[0])); @@ -72,7 +71,9 @@ fn read_huffman_code( )); } - d.init(&lens, num_lens) + let mut d = HuffmanDecoder::default(); + d.init(&lens, num_lens)?; + Ok(d) } fn hwexplode( @@ -85,14 +86,13 @@ fn hwexplode( ) -> std::io::Result<()> { let bit_length = src.len() as u64 * 8; let mut is = BitReader::endian(Cursor::new(&src), LittleEndian); - let mut lit_decoder = HuffmanDecoder::default(); - let mut len_decoder = HuffmanDecoder::default(); - let mut dist_decoder = HuffmanDecoder::default(); - if lit_tree { - read_huffman_code(&mut is, 256, &mut lit_decoder)?; - } - read_huffman_code(&mut is, 64, &mut len_decoder)?; - read_huffman_code(&mut is, 64, &mut dist_decoder)?; + let mut lit_decoder_opt = if lit_tree { + Some(read_huffman_code(&mut is, 256)?) 
+ } else { + None + }; + let mut len_decoder = read_huffman_code(&mut is, 64)?; + let mut dist_decoder = read_huffman_code(&mut is, 64)?; let min_len = if pk101_bug_compat { if large_wnd { 3 @@ -112,7 +112,7 @@ fn hwexplode( if is_literal { // Literal. let sym; - if lit_tree { + if let Some(lit_decoder) = &mut lit_decoder_opt { sym = lit_decoder.huffman_decode(bit_length, &mut is)?; } else { sym = is.read::(8)? as u16; diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index af5eb8608..6047be660 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -56,7 +56,9 @@ fn read_next_byte( fsets: &mut FollowerSetArray, ) -> io::Result { if fsets[prev_byte as usize].size == 0 // No followers - || is.read::(1)? == 1 {// Indicates next symbol is a literal byte + || is.read::(1)? == 1 // Indicates next symbol is a literal byte + { + return Ok(is.read::(8)?); } From 5d3654e82a0f4025d2d7a0d7b02b018be3e45eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 21 May 2024 10:48:00 +0200 Subject: [PATCH 45/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 5c08e8ea1..769601ac2 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -115,22 +115,21 @@ fn read_code( code_size: &mut u8, codetab: &mut [Codetab], queue: &mut CodeQueue, -) -> io::Result> { +) -> io::Result { // assert(sizeof(code) * CHAR_BIT >= *code_size); let code = is.read::(*code_size as u32)?; // Handle regular codes (the common case). if code != CONTROL_CODE as u16 { - return Ok(Some(code)); + return Ok(code); } // Handle control codes. 
- let control_code = if let Ok(c) = is.read::(*code_size as u32) { - c - } else { - return Ok(None); - }; - if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE { + let control_code = is.read::(*code_size as u32)?; + if control_code == INC_CODE_SIZE { + if *code_size >= MAX_CODE_SIZE { + return Err(io::Error::new(ErrorKind::InvalidData, "tried to increase code size when already at maximum)); + } (*code_size) += 1; return read_code(is, code_size, codetab, queue); } @@ -138,7 +137,7 @@ fn read_code( unshrink_partial_clear(codetab, queue); return read_code(is, code_size, codetab, queue); } - return Ok(None); + Err(io::Error::new(ErrorKind::InvalidData, format!("Invalid control code {}", control_code))) } /// Output the string represented by a code into dst at dst_pos. Returns From 43771767b51aff8222dfd0e032fed959c43bd4c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 21 May 2024 10:48:45 +0200 Subject: [PATCH 46/50] Update src/legacy/implode.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/implode.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/legacy/implode.rs b/src/legacy/implode.rs index 54b43d0a9..2a8c622f6 100644 --- a/src/legacy/implode.rs +++ b/src/legacy/implode.rs @@ -93,18 +93,12 @@ fn hwexplode( }; let mut len_decoder = read_huffman_code(&mut is, 64)?; let mut dist_decoder = read_huffman_code(&mut is, 64)?; - let min_len = if pk101_bug_compat { - if large_wnd { - 3 - } else { - 2 - } + let min_len = if pk101_bug_compat && large_wnd { + 3 + } else if !pk101_bug_compat && lit_tree { + 3 } else { - if lit_tree { - 3 - } else { - 2 - } + 2 }; let dist_low_bits = if large_wnd { 7 } else { 6 }; while dst.len() < uncomp_len { From 885d17ccdede32a3b284b3b009250ab651f562a4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 21 May 2024 10:52:36 +0200 Subject: [PATCH 47/50] Revert "Update src/legacy/shrink.rs" This reverts commit 5d3654e82a0f4025d2d7a0d7b02b018be3e45eea. --- src/legacy/shrink.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 769601ac2..5c08e8ea1 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -115,21 +115,22 @@ fn read_code( code_size: &mut u8, codetab: &mut [Codetab], queue: &mut CodeQueue, -) -> io::Result { +) -> io::Result> { // assert(sizeof(code) * CHAR_BIT >= *code_size); let code = is.read::(*code_size as u32)?; // Handle regular codes (the common case). if code != CONTROL_CODE as u16 { - return Ok(code); + return Ok(Some(code)); } // Handle control codes. - let control_code = is.read::(*code_size as u32)?; - if control_code == INC_CODE_SIZE { - if *code_size >= MAX_CODE_SIZE { - return Err(io::Error::new(ErrorKind::InvalidData, "tried to increase code size when already at maximum)); - } + let control_code = if let Ok(c) = is.read::(*code_size as u32) { + c + } else { + return Ok(None); + }; + if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE { (*code_size) += 1; return read_code(is, code_size, codetab, queue); } @@ -137,7 +138,7 @@ fn read_code( unshrink_partial_clear(codetab, queue); return read_code(is, code_size, codetab, queue); } - Err(io::Error::new(ErrorKind::InvalidData, format!("Invalid control code {}", control_code))) + return Ok(None); } /// Output the string represented by a code into dst at dst_pos. Returns From fb948ee1495231a82940518e1473ebf008d02d87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 21 May 2024 10:56:07 +0200 Subject: [PATCH 48/50] Fix build. 
--- .gitattributes | 3 +++ src/legacy/reduce.rs | 4 ++-- src/legacy/shrink.rs | 37 ++++++++++++++++++++++++------------- 3 files changed, 29 insertions(+), 15 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..7e23dfbb9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +tests/implode_hamlet_256.bin binary +tests/reduce_hamlet_2048.bin binary +tests/reduce_zero_reduced.bin binary diff --git a/src/legacy/reduce.rs b/src/legacy/reduce.rs index 6047be660..56fe82948 100644 --- a/src/legacy/reduce.rs +++ b/src/legacy/reduce.rs @@ -56,9 +56,9 @@ fn read_next_byte( fsets: &mut FollowerSetArray, ) -> io::Result { if fsets[prev_byte as usize].size == 0 // No followers - || is.read::(1)? == 1 // Indicates next symbol is a literal byte + || is.read::(1)? == 1 + // Indicates next symbol is a literal byte { - return Ok(is.read::(8)?); } diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 5c08e8ea1..7f9fc7bd4 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -125,20 +125,30 @@ fn read_code( } // Handle control codes. 
- let control_code = if let Ok(c) = is.read::(*code_size as u32) { - c - } else { - return Ok(None); - }; - if control_code == INC_CODE_SIZE && *code_size < MAX_CODE_SIZE { - (*code_size) += 1; - return read_code(is, code_size, codetab, queue); - } - if control_code == PARTIAL_CLEAR { - unshrink_partial_clear(codetab, queue); + if let Ok(control_code) = is.read::(*code_size as u32) { + match control_code { + INC_CODE_SIZE => { + if *code_size >= MAX_CODE_SIZE { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "tried to increase code size when already at maximum", + )); + } + *code_size += 1; + } + PARTIAL_CLEAR => { + unshrink_partial_clear(codetab, queue); + } + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid control code {}", control_code), + )); + } + } return read_code(is, code_size, codetab, queue); } - return Ok(None); + Ok(None) } /// Output the string represented by a code into dst at dst_pos. Returns @@ -291,7 +301,8 @@ fn hwunshrink(src: &[u8], uncompressed_size: usize, dst: &mut VecDeque) -> i &mut codetab, &mut queue, &mut first_byte, - &mut len)?; + &mut len, + )?; // Add a new code to the string table if there's room. // The string is the previous code's string extended with From a2f73e8e84c876a958b6a1f9e67bff46e52064e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Tue, 21 May 2024 11:22:21 +0200 Subject: [PATCH 49/50] Use Vec in CodeQueue. 
--- src/legacy/shrink.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 7f9fc7bd4..0b8d36ef5 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -17,12 +17,12 @@ const UNKNOWN_LEN: u16 = u16::MAX; struct CodeQueue { next_idx: usize, - codes: [Option; MAX_CODE as usize - CONTROL_CODE + 1], + codes: Vec>, } impl CodeQueue { fn new() -> Self { - let mut codes = [None; MAX_CODE as usize - CONTROL_CODE + 1]; + let mut codes = vec![None; MAX_CODE as usize - CONTROL_CODE + 1]; for (i, code) in (CONTROL_CODE as u16 + 1..=MAX_CODE as u16).enumerate() { codes[i] = Some(code); } From ab2a9b15faa99af4d8acc202bf4f289116c939f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20Kr=C3=BCger?= Date: Wed, 22 May 2024 12:48:46 +0200 Subject: [PATCH 50/50] Update src/legacy/shrink.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> Signed-off-by: Mike Krüger --- src/legacy/shrink.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/legacy/shrink.rs b/src/legacy/shrink.rs index 0b8d36ef5..f9e8a2d67 100644 --- a/src/legacy/shrink.rs +++ b/src/legacy/shrink.rs @@ -52,10 +52,10 @@ impl CodeQueue { #[derive(Clone, Debug, Copy)] struct Codetab { + last_dst_pos: usize, prefix_code: Option, - ext_byte: u8, len: u16, - last_dst_pos: usize, + ext_byte: u8, } impl Default for Codetab {