From 73143a0ad6a19df6fa8cb5c333b4fffaf3dddb6d Mon Sep 17 00:00:00 2001 From: David Caldwell Date: Tue, 19 Nov 2024 10:49:35 -0800 Subject: [PATCH] perf: Faster cde rejection (#255) * Use the tempfile crate instead of the tempdir crate (which is deprecated) https://github.com/rust-lang-deprecated/tempdir?tab=readme-ov-file#deprecation-note * perf: Add benchmark that measures the rejection speed of a large non-zip file * perf: Speed up non-zip rejection by increasing END_WINDOW_SIZE I tested several END_WINDOW_SIZEs across 2 machines: Machine 1: macOS 15.0.1, aarch64 (apfs /tmp) 512: test parse_large_non_zip ... bench: 30,450,608 ns/iter (+/- 673,910) 4096: test parse_large_non_zip ... bench: 7,741,366 ns/iter (+/- 521,101) 8192: test parse_large_non_zip ... bench: 5,807,443 ns/iter (+/- 546,227) 16384: test parse_large_non_zip ... bench: 4,794,314 ns/iter (+/- 419,114) 32768: test parse_large_non_zip ... bench: 4,262,897 ns/iter (+/- 397,582) 65536: test parse_large_non_zip ... bench: 4,060,847 ns/iter (+/- 280,964) Machine 2: Debian testing, x86_64 (tmpfs /tmp) 512: test parse_large_non_zip ... bench: 65,132,581 ns/iter (+/- 7,429,976) 4096: test parse_large_non_zip ... bench: 14,109,503 ns/iter (+/- 2,892,086) 8192: test parse_large_non_zip ... bench: 9,942,500 ns/iter (+/- 1,886,063) 16384: test parse_large_non_zip ... bench: 8,205,851 ns/iter (+/- 2,902,041) 32768: test parse_large_non_zip ... bench: 7,012,011 ns/iter (+/- 2,222,879) 65536: test parse_large_non_zip ... bench: 6,577,275 ns/iter (+/- 881,546) In both cases END_WINDOW_SIZE=8192 performed about 6x better than 512 and >8192 didn't make much of a difference on top of that. * perf: Speed up non-zip rejection by limiting search for EOCDR. I benchmarked several search sizes across 2 machines (these benches are using an 8192 END_WINDOW_SIZE): Machine 1: macOS 15.0.1, aarch64 (apfs /tmp) whole file: test parse_large_non_zip ... bench: 5,773,801 ns/iter (+/- 411,277) last 128k: test parse_large_non_zip ... bench: 54,402 ns/iter (+/- 4,126) last 66,000: test parse_large_non_zip ... bench: 36,152 ns/iter (+/- 4,293) Machine 2: Debian testing, x86_64 (tmpfs /tmp) whole file: test parse_large_non_zip ... bench: 9,942,306 ns/iter (+/- 1,963,522) last 128k: test parse_large_non_zip ... bench: 73,604 ns/iter (+/- 16,662) last 66,000: test parse_large_non_zip ... bench: 41,349 ns/iter (+/- 16,812) As you might expect these significantly increase the rejection speed for large non-zip files. 66,000 was the number previously used by zip-rs. It was changed to zero in 7a559457431f3418bcc0bcb9ba68559b3b57de99. 128K is what Info-Zip uses[1]. This seems like a reasonable (non-zero) choice for compatibility reasons. [1] Info-zip is extremely old and doesn't not have an official git repo to link to. However, an unofficial fork can be found here: https://github.com/hiirotsuki/infozip/blob/bb0c4755d44f21bda0744a5e1868d25055a543cc/zipfile.c#L4073 --------- Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com> --- Cargo.toml | 2 +- benches/read_metadata.rs | 20 ++++++++++++++++++-- src/read.rs | 4 ++-- src/spec.rs | 13 ++++++++++--- tests/extract_symlink.rs | 4 ++-- tests/repro_old423.rs | 4 ++-- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 744e68c02..c0ff120b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ walkdir = "2.5.0" time = { workspace = true, features = ["formatting", "macros"] } anyhow = "1" clap = { version = "=4.4.18", features = ["derive"] } -tempdir = "0.3.7" +tempfile = "3" [features] aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"] diff --git a/benches/read_metadata.rs b/benches/read_metadata.rs index 73f2b26ed..1dd4456ca 100644 --- a/benches/read_metadata.rs +++ b/benches/read_metadata.rs @@ -5,7 +5,7 @@ use std::io::{self, prelude::*, Cursor}; use bencher::Bencher; use getrandom::getrandom; -use tempdir::TempDir; +use tempfile::TempDir; use zip::write::SimpleFileOptions; use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter}; @@ -102,7 +102,7 @@ fn parse_stream_archive(bench: &mut Bencher) { let bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap(); /* Write to a temporary file path to incur some filesystem overhead from repeated reads */ - let dir = TempDir::new("stream-bench").unwrap(); + let dir = TempDir::with_prefix("stream-bench").unwrap(); let out = dir.path().join("bench-out.zip"); fs::write(&out, &bytes).unwrap(); @@ -116,11 +116,27 @@ fn parse_stream_archive(bench: &mut Bencher) { bench.bytes = bytes.len() as u64; } +fn parse_large_non_zip(bench: &mut Bencher) { + const FILE_SIZE: usize = 17_000_000; + + // Create a large file that doesn't have a zip header (generating random data _might_ make a zip magic + // number somewhere which is _not_ what we're trying to test). + let dir = TempDir::with_prefix("large-non-zip-bench").unwrap(); + let file = dir.path().join("zeros"); + let buf = vec![0u8; FILE_SIZE]; + fs::write(&file, &buf).unwrap(); + + bench.iter(|| { + assert!(zip::ZipArchive::new(std::fs::File::open(&file).unwrap()).is_err()); + }) +} + benchmark_group!( benches, read_metadata, parse_archive_with_comment, parse_zip64_archive_with_comment, parse_stream_archive, + parse_large_non_zip, ); benchmark_main!(benches); diff --git a/src/read.rs b/src/read.rs index 354d80702..9e0d27010 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1777,7 +1777,7 @@ mod test { use crate::CompressionMethod::Stored; use crate::{ZipArchive, ZipWriter}; use std::io::{Cursor, Read, Write}; - use tempdir::TempDir; + use tempfile::TempDir; #[test] fn invalid_offset() { @@ -1979,7 +1979,7 @@ mod test { v.extend_from_slice(include_bytes!("../tests/data/symlink.zip")); let mut reader = ZipArchive::new(Cursor::new(v)).unwrap(); assert!(reader.by_index(0).unwrap().is_symlink()); - let tempdir = TempDir::new("test_is_symlink")?; + let tempdir = TempDir::with_prefix("test_is_symlink")?; reader.extract(&tempdir).unwrap(); assert!(tempdir.path().join("bar").is_symlink()); Ok(()) diff --git a/src/spec.rs b/src/spec.rs index 1dfb4e5e7..0bd89a9a7 100755 --- a/src/spec.rs +++ b/src/spec.rs @@ -353,9 +353,16 @@ impl Zip32CentralDirectoryEnd { return Err(ZipError::InvalidArchive("Invalid zip header")); } - let search_lower_bound = 0; - - const END_WINDOW_SIZE: usize = 512; + // The End Of Central Directory Record should be the last thing in + // the file and so searching the last 65557 bytes of the file should + // be enough. However, not all zips are well-formed and other + // programs may consume zips with extra junk at the end without + // error, so we go back 128K to be compatible with them. 128K is + // arbitrary, but it matches what Info-Zip does. + const EOCDR_SEARCH_SIZE: u64 = 128 * 1024; + let search_lower_bound = file_length.saturating_sub(EOCDR_SEARCH_SIZE); + + const END_WINDOW_SIZE: usize = 8192; /* TODO: use static_assertions!() */ debug_assert!(END_WINDOW_SIZE > mem::size_of::()); diff --git a/tests/extract_symlink.rs b/tests/extract_symlink.rs index 7135df50b..a53504ba6 100644 --- a/tests/extract_symlink.rs +++ b/tests/extract_symlink.rs @@ -2,13 +2,13 @@ #[cfg(all(unix, feature = "_deflate-any"))] fn extract_should_respect_links() { use std::{fs, io, path::PathBuf, str::FromStr}; - use tempdir::TempDir; + use tempfile::TempDir; use zip::ZipArchive; let mut v = Vec::new(); v.extend_from_slice(include_bytes!("data/pandoc_soft_links.zip")); let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file"); - let temp_dir = TempDir::new("pandoc_soft_links").unwrap(); + let temp_dir = TempDir::with_prefix("pandoc_soft_links").unwrap(); archive.extract(&temp_dir).unwrap(); let symlink_path = temp_dir.path().join("pandoc-3.2-arm64/bin/pandoc-lua"); diff --git a/tests/repro_old423.rs b/tests/repro_old423.rs index 83adf950b..f87245e5b 100644 --- a/tests/repro_old423.rs +++ b/tests/repro_old423.rs @@ -2,11 +2,11 @@ #[test] fn repro_old423() -> zip::result::ZipResult<()> { use std::io; - use tempdir::TempDir; + use tempfile::TempDir; use zip::ZipArchive; let mut v = Vec::new(); v.extend_from_slice(include_bytes!("data/lin-ub_iwd-v11.zip")); let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file"); - archive.extract(TempDir::new("repro_old423")?) + archive.extract(TempDir::with_prefix("repro_old423")?) }