From ab891f17cfabd07fba7d0749dbab8addc2bb3984 Mon Sep 17 00:00:00 2001 From: Jonathan Giddy Date: Tue, 8 Nov 2022 11:04:38 +0000 Subject: [PATCH] Add write::MultiGzDecoder for multi-member gzip data --- src/gz/write.rs | 128 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 2 files changed, 129 insertions(+) diff --git a/src/gz/write.rs b/src/gz/write.rs index 7cf1a7cd..d6ec27c4 100644 --- a/src/gz/write.rs +++ b/src/gz/write.rs @@ -373,6 +373,112 @@ impl Read for GzDecoder { } } +/// A gzip streaming decoder that decodes all members of a multistream +/// +/// A gzip member consists of a header, compressed data and a trailer. The [gzip +/// specification](https://tools.ietf.org/html/rfc1952), however, allows multiple +/// gzip members to be joined in a single stream. `MultiGzDecoder` will +/// decode all consecutive members while `GzDecoder` will only decompress +/// the first gzip member. The multistream format is commonly used in +/// bioinformatics, for example when using the BGZF compressed data. +/// +/// This structure exposes a [`Write`] interface that will consume all gzip members +/// from the written buffers and write uncompressed data to the writer. +#[derive(Debug)] +pub struct MultiGzDecoder { + inner: GzDecoder, +} + +impl MultiGzDecoder { + /// Creates a new decoder which will write uncompressed data to the stream. + /// If the gzip stream contains multiple members all will be decoded. + pub fn new(w: W) -> MultiGzDecoder { + MultiGzDecoder { + inner: GzDecoder::new(w), + } + } + + /// Returns the header associated with the current member. + pub fn header(&self) -> Option<&GzHeader> { + self.inner.header() + } + + /// Acquires a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + self.inner.get_ref() + } + + /// Acquires a mutable reference to the underlying writer. + /// + /// Note that mutating the output/input state of the stream may corrupt this + /// object, so care must be taken when using this method. + pub fn get_mut(&mut self) -> &mut W { + self.inner.get_mut() + } + + /// Attempt to finish this output stream, writing out final chunks of data. + /// + /// Note that this function can only be used once data has finished being + /// written to the output stream. After this function is called then further + /// calls to `write` may result in a panic. + /// + /// # Panics + /// + /// Attempts to write data to this stream may result in a panic after this + /// function is called. + /// + /// # Errors + /// + /// This function will perform I/O to finish the stream, returning any + /// errors which happen. + pub fn try_finish(&mut self) -> io::Result<()> { + self.inner.try_finish() + } + + /// Consumes this decoder, flushing the output stream. + /// + /// This will flush the underlying data stream and then return the contained + /// writer if the flush succeeded. + /// + /// Note that this function may not be suitable to call in a situation where + /// the underlying stream is an asynchronous I/O stream. To finish a stream + /// the `try_finish` (or `shutdown`) method should be used instead. To + /// re-acquire ownership of a stream it is safe to call this method after + /// `try_finish` or `shutdown` has returned `Ok`. + /// + /// # Errors + /// + /// This function will perform I/O to complete this stream, and any I/O + /// errors which occur will be returned from this function. + pub fn finish(self) -> io::Result { + self.inner.finish() + } +} + +impl Write for MultiGzDecoder { + fn write(&mut self, buf: &[u8]) -> io::Result { + if buf.is_empty() { + Ok(0) + } else { + match self.inner.write(buf) { + Ok(0) => { + // When the GzDecoder indicates that it has finished + // create a new GzDecoder to handle additional data. + self.inner.try_finish()?; + let w = self.inner.inner.take_inner().into_inner(); + self.inner = GzDecoder::new(w); + self.inner.write(buf) + } + res => res, + } + } + } + + fn flush(&mut self) -> io::Result<()> { + self.inner.flush() + } +} + #[cfg(test)] mod tests { use super::*; @@ -447,4 +553,26 @@ mod tests { let return_string = String::from_utf8(writer).expect("String parsing error"); assert_eq!(return_string, STR); } + + // Two or more gzip files concatenated form a multi-member gzip file. MultiGzDecoder will + // concatenate the decoded contents of all members. + #[test] + fn decode_multi_writer() { + let mut e = GzEncoder::new(Vec::new(), Compression::default()); + e.write(STR.as_ref()).unwrap(); + let bytes = e.finish().unwrap().repeat(2); + + let mut writer = Vec::new(); + let mut decoder = MultiGzDecoder::new(writer); + let mut count = 0; + while count < bytes.len() { + let n = decoder.write(&bytes[count..]).unwrap(); + assert!(n != 0); + count += n; + } + writer = decoder.finish().unwrap(); + let return_string = String::from_utf8(writer).expect("String parsing error"); + let expected = STR.repeat(2); + assert_eq!(return_string, expected); + } } diff --git a/src/lib.rs b/src/lib.rs index 23a783e5..64af355c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,6 +115,7 @@ pub mod write { pub use crate::deflate::write::DeflateEncoder; pub use crate::gz::write::GzDecoder; pub use crate::gz::write::GzEncoder; + pub use crate::gz::write::MultiGzDecoder; pub use crate::zlib::write::ZlibDecoder; pub use crate::zlib::write::ZlibEncoder; }