Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parallel/pipelined extraction #208

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ lzma-rs = { version = "0.3.0", default-features = false, optional = true }
[target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies]
crossbeam-utils = "0.8.20"

[target.'cfg(unix)'.dependencies]
libc = { version = "0.2.155", optional = true }

[target.'cfg(fuzzing)'.dependencies]
arbitrary = { version = "1.3.2", features = ["derive"] }

Expand All @@ -63,6 +66,7 @@ time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1"
clap = { version = "=4.4.18", features = ["derive"] }
tempdir = "0.3.7"
tempfile = "3.10.1"

[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
Expand All @@ -79,6 +83,7 @@ deflate-zopfli = ["zopfli", "_deflate-any"]
lzma = ["lzma-rs/stream"]
unreserved = []
xz = ["lzma-rs/raw_decoder"]
parallelism = ["libc"]
default = [
"aes-crypto",
"bzip2",
Expand All @@ -101,3 +106,7 @@ harness = false
[[bench]]
name = "merge_archive"
harness = false

[[bench]]
name = "extract"
harness = false
86 changes: 86 additions & 0 deletions benches/extract.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use bencher::{benchmark_group, benchmark_main};

use bencher::Bencher;
use tempdir::TempDir;

use std::fs;
use std::path::Path;

use zip::result::ZipResult;
use zip::ZipArchive;

#[cfg(all(feature = "parallelism", unix))]
cosmicexplorer marked this conversation as resolved.
Show resolved Hide resolved
use zip::read::{split_extract, ExtractionParameters};

/// Opens the fixture archive shared by the extraction benchmarks.
///
/// The archive contains one set of entries repeated 20x:
/// - 200K of random data, stored uncompressed (`CompressionMethod::Stored`)
/// - 246K of text data: the Project Gutenberg HTML version of King Lear
///   (`CompressionMethod::Bzip2`, compression level 1). Project Gutenberg
///   ebooks are public domain.
///
/// The full archive file is 5.3MB.
///
/// # Errors
///
/// Propagates the I/O error if the fixture cannot be opened, or whatever
/// error `ZipArchive::new` reports for the opened file.
fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
    // Resolve the fixture relative to the crate root, not the current directory.
    let mut fixture = Path::new(env!("CARGO_MANIFEST_DIR")).to_path_buf();
    fixture.push("tests/data/stored-and-compressed-text.zip");

    let handle = fs::File::open(fixture)?;
    ZipArchive::new(handle)
}

/// Benchmarks extracting the whole fixture archive with `ZipArchive::extract`.
///
/// Each iteration extracts into a fresh subdirectory of a single parent temp
/// dir, so runs never write over each other's output.
fn extract_basic(bench: &mut Bencher) {
    let mut archive = get_test_archive().unwrap();
    // Total uncompressed payload, narrowed to the `u64` that `Bencher::bytes` stores.
    let uncompressed_len = u64::try_from(archive.decompressed_size().unwrap()).unwrap();

    let scratch_root = TempDir::new("zip-extract").unwrap();

    bench.bytes = uncompressed_len;
    bench.bench_n(1, |b| {
        b.iter(move || {
            let subdir = TempDir::new_in(scratch_root.path(), "bench-subdir").unwrap();
            // `into_path` hands back the plain path; the subdir handle is consumed here.
            let destination = subdir.into_path();
            archive.extract(destination).unwrap();
        });
    });
}

#[cfg(all(feature = "parallelism", unix))]
const DECOMPRESSION_THREADS: usize = 8;

#[cfg(all(feature = "parallelism", unix))]
/// Benchmarks extracting the whole fixture archive with `split_extract`,
/// configured to use `DECOMPRESSION_THREADS` decompression threads.
///
/// Each iteration extracts into a fresh subdirectory of a single parent temp
/// dir, so runs never write over each other's output.
fn extract_split(bench: &mut Bencher) {
    let archive = get_test_archive().unwrap();
    // Total uncompressed payload, narrowed to the `u64` that `Bencher::bytes` stores.
    let uncompressed_len = u64::try_from(archive.decompressed_size().unwrap()).unwrap();

    // Override only the thread count; every other knob keeps its default.
    let extraction_params = ExtractionParameters {
        decompression_threads: DECOMPRESSION_THREADS,
        ..Default::default()
    };

    let scratch_root = TempDir::new("zip-extract").unwrap();

    bench.bytes = uncompressed_len;
    bench.bench_n(1, |b| {
        b.iter(move || {
            let subdir = TempDir::new_in(scratch_root.path(), "bench-subdir").unwrap();
            // `into_path` hands back the plain path; the subdir handle is consumed here.
            let destination = subdir.into_path();
            // The parameters are cloned per call because `split_extract` takes them by value.
            split_extract(&archive, &destination, extraction_params.clone()).unwrap();
        });
    });
}

#[cfg(not(all(feature = "parallelism", unix)))]
benchmark_group!(benches, extract_basic);

#[cfg(all(feature = "parallelism", unix))]
benchmark_group!(benches, extract_basic, extract_split);

benchmark_main!(benches);
10 changes: 10 additions & 0 deletions src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@
#[cfg(feature = "xz")]
pub(crate) mod xz;

#[cfg(feature = "parallelism")]
pub(crate) mod pipelining;
#[cfg(all(unix, feature = "parallelism"))]
pub use pipelining::split_extraction::{split_extract, ExtractionParameters, SplitExtractionError};
#[cfg(feature = "parallelism")]
pub(crate) mod split;

// Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
pub(crate) mod zip_archive {
use indexmap::IndexMap;
Expand Down Expand Up @@ -1076,6 +1083,9 @@

fn make_writable_dir_all<T: AsRef<Path>>(outpath: T) -> Result<(), ZipError> {
create_dir_all(outpath.as_ref())?;
/* TODO: do we want to automatically make the directory writable? Wouldn't we prefer to
 * respect the write permissions of the extraction dir? Pipelined extraction does not
 * mutate permissions like this. */
cosmicexplorer marked this conversation as resolved.
Show resolved Hide resolved
#[cfg(unix)]
{
// Dirs must be writable until all normal files are extracted
Expand Down Expand Up @@ -1604,7 +1614,7 @@
/// `foo/../bar` as `foo/bar` (instead of `bar`). Because of this,
/// [`ZipFile::enclosed_name`] is the better option in most scenarios.
///
/// [`ParentDir`]: `std::path::Component::ParentDir`

Check warning on line 1617 in src/read.rs

View workflow job for this annotation

GitHub Actions / style_and_docs (--no-default-features)

unresolved link to `Component::ParentDir`

Check warning on line 1617 in src/read.rs

View workflow job for this annotation

GitHub Actions / style_and_docs (--all-features)

unresolved link to `Component::ParentDir`

Check warning on line 1617 in src/read.rs

View workflow job for this annotation

GitHub Actions / style_and_docs

unresolved link to `Component::ParentDir`
pub fn mangled_name(&self) -> PathBuf {
self.data.file_name_sanitized()
}
Expand Down
Loading
Loading