diff --git a/CHANGELOG-foldiff.md b/CHANGELOG-foldiff.md index ff3f154..bf303e3 100644 --- a/CHANGELOG-foldiff.md +++ b/CHANGELOG-foldiff.md @@ -6,6 +6,9 @@ - replace `anyhow` with custom error types - write custom threading utilities +## 1.3.1 +- reflinks now apply for duplicated files too + ## 1.3.0 - `foldiff upgrade` - upgrade older manifests to new ones - move core `foldiff` functionality to `libfoldiff` diff --git a/Cargo.lock b/Cargo.lock index f02691c..a75341c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -253,7 +253,7 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldiff" -version = "1.3.0" +version = "1.3.1" dependencies = [ "anyhow", "clap", @@ -347,7 +347,7 @@ checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "libfoldiff" -version = "1.3.0" +version = "1.3.1" dependencies = [ "anyhow", "countio", diff --git a/foldiff/Cargo.toml b/foldiff/Cargo.toml index def68ab..aadc222 100644 --- a/foldiff/Cargo.toml +++ b/foldiff/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "foldiff" authors = ["Hazel Atkinson"] -version = "1.3.0" +version = "1.3.1" edition = "2021" license-file = "../LICENSE.md" description = "A general purpose diffing tool that operates on folders of mixed text/binary files." diff --git a/foldiff/src/main.rs b/foldiff/src/main.rs index 5c2145d..6179782 100644 --- a/foldiff/src/main.rs +++ b/foldiff/src/main.rs @@ -9,9 +9,9 @@ mod cliutils; #[derive(Parser, Debug)] #[command( - version = "v1.3.0", + version = "v1.3.1", about, - long_version = "v1.3.0 + long_version = "v1.3.1 writing fldf v1.1.0 reading fldf 1.0.0-r, v1.1.0" )] diff --git a/libfoldiff/Cargo.toml b/libfoldiff/Cargo.toml index f95f752..0f247e5 100644 --- a/libfoldiff/Cargo.toml +++ b/libfoldiff/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "libfoldiff" authors = ["Hazel Atkinson"] -version = "1.3.0" +version = "1.3.1" edition = "2021" license-file = "../LICENSE.md" description = "A general purpose diffing library for the FDLF format." diff --git a/libfoldiff/src/applying.rs b/libfoldiff/src/applying.rs index 9640568..42abcc4 100644 --- a/libfoldiff/src/applying.rs +++ b/libfoldiff/src/applying.rs @@ -1,7 +1,7 @@ -use crate::common::create_file; +use crate::common::{copy_rl, copy_rl_hash, create_file}; use crate::manifest::DiffManifest; use crate::reporting::{AutoSpin, CanBeWrappedBy, Reporter, ReporterSized, ReportingMultiWrapper}; -use crate::{aggregate_errors, handle_res_parit, hash, zstddiff}; +use crate::{aggregate_errors, handle_res_async, handle_res_parit, hash, throw_err_async, zstddiff}; use anyhow::{anyhow, Context}; use memmap2::Mmap; use rayon::prelude::*; @@ -32,7 +32,8 @@ impl ApplyingDiff { let diff_map = &**self.read.as_ref().ok_or(anyhow!("Cannot call apply() on a state without a set `read` prop"))?; - let num_duped_files: u64 = self.manifest.duplicated_files.iter().map(|d| d.new_paths.len() as u64).sum(); + let num_duped_copy: usize = self.manifest.duplicated_files.iter().filter(|d| d.idx == u64::MAX).map(|d| d.new_paths.len()).sum(); + let num_duped_create: usize = self.manifest.duplicated_files.iter().filter(|d| d.idx != u64::MAX).map(|d| d.new_paths.len()).sum(); // incr bar and finish if done let inc_n = |n: usize, b: &TBar| { @@ -46,8 +47,8 @@ impl ApplyingDiff { // progress reporting let wrap = TWrap::new(); let spn = TSpin::new("Applying diff").add_to(&wrap); - let bar_untouched = ::new("Copying unchanged files", self.manifest.untouched_files.len() + (num_duped_files as usize)).add_to(&wrap); - let bar_new = ::new("Creating new files", self.manifest.new_files.len()).add_to(&wrap); + let bar_untouched = ::new("Copying unchanged files", self.manifest.untouched_files.len() + num_duped_copy).add_to(&wrap); + let bar_new = ::new("Creating new files", self.manifest.new_files.len() + num_duped_create).add_to(&wrap); let bar_patched = ::new("Applying patched files", self.manifest.patched_files.len()).add_to(&wrap); let as1 = AutoSpin::spin(&spn); @@ -74,23 +75,7 @@ impl ApplyingDiff { let old_path = self.old_root.join(p); let new_path = self.new_root.join(p); - let real_hash = - // if we're on *nix, try reflinking - if cfg!(unix) && reflink::reflink(&old_path, &new_path).is_ok() { - // reflinked, check the hash - handle_res_parit!(hash::hash_file(&old_path), "Failed to hash file copied from {}", p) - } - else { - // reflink failed or we're on windows, copy - // copying in kernel space would be slightly faster but we have to check the hash - let mut src = handle_res_parit!(File::open(&old_path), "Failed to open file to copy from {}", p); - let mut dst = handle_res_parit!(create_file(&new_path), "Failed to create file to copy to {}", p); - - let mut hw = hash::XXHashStreamer::new(&mut dst); - handle_res_parit!(std::io::copy(&mut src, &mut hw), "Failed to copy file {}", p); - - hw.finish() - }; + let real_hash = handle_res_parit!(copy_rl_hash(old_path, new_path)); if real_hash != h { return Some(anyhow!("Found {p} was different to expected (hash was {real_hash}, not {})", h)); @@ -131,56 +116,73 @@ impl ApplyingDiff { } // okay, now copy to all the new places then - let mut checks: Vec<_> = d.new_paths - .par_iter() - .filter_map(|p| { - // if we have a file on disk, then perform an in-kernel copy for speed - if d.idx == u64::MAX { - // ensure we have a parent directory - let dest_path = self.new_root.join(p); - if let Some(par) = dest_path.parent() { - handle_res_parit!(std::fs::create_dir_all(par), "Failed to create parent dir to copy file {p}"); - } - - handle_res_parit!(std::fs::copy(self.old_root.join(&d.old_paths[0]), dest_path), "Failed to copy file {p}"); + // if we have a file on disk, then perform an in-kernel copy for speed + let mut checks: Vec<_> = + if d.idx == u64::MAX { + d.new_paths + .par_iter() + .filter_map(|p| { + // ensure we have a parent directory + let dest_path = self.new_root.join(p); + if let Some(par) = dest_path.parent() { + handle_res_parit!(std::fs::create_dir_all(par), "Failed to create parent dir to copy file {p}"); + } + + handle_res_parit!(copy_rl(self.old_root.join(&d.old_paths[0]), dest_path), "Failed to copy file {p}"); + None + }) + .collect() + } + else { + // we need to copy out of ourself + let blob = if let Some(t) = self.blobs_new.get(d.idx as usize) { + *t as usize } else { - // we need to copy out of ourself - let blob = if let Some(t) = self.blobs_new.get(d.idx as usize) { - *t as usize - } - else { - return Some(anyhow!("new file {} had an out-of-range index pointing to its data", p)); - }; - - // read length - let len = u64::from_be_bytes(*diff_map[blob..].first_chunk().unwrap()) as usize; - let blob = blob + 8; // advance past length - - // TODO: reflink - // copy - let mut read = Cursor::new(&diff_map[blob..(blob + len)]); - let f = handle_res_parit!(create_file(&self.new_root.join(p)), "Failed to create new file {p} to write to"); - let mut writer = hash::XXHashStreamer::new(f); - - handle_res_parit!(std::io::copy(&mut read, &mut writer)); - - // check hash - let rh = writer.finish(); - if rh != d.hash { - return Some(anyhow!("Newly created file {p} does not match expected data")) - } + throw_err_async!(errs, anyhow!("new file {} had an out-of-range index pointing to its data", d.new_paths[0])); + }; + + // read length + let len = u64::from_be_bytes(*diff_map[blob..].first_chunk().unwrap()) as usize; + let blob = blob + 8; // advance past length + + // copy one out + let p = &d.new_paths[0]; + let mut read = Cursor::new(&diff_map[blob..(blob + len)]); + let f = handle_res_async!(errs, create_file(&self.new_root.join(p)), "Failed to create new file {p} to write to"); + let mut writer = hash::XXHashStreamer::new(f); + + handle_res_async!(errs, std::io::copy(&mut read, &mut writer)); + + // check hash + let rh = writer.finish(); + if rh != d.hash { + throw_err_async!(errs, anyhow!("Newly created file {p} does not match expected data")); } - None - }) - .collect(); + + // copy to the rest + d.new_paths + .par_iter() + .skip(1) + .filter_map(|p| { + // ensure we have a parent directory + let dest_path = self.new_root.join(p); + if let Some(par) = dest_path.parent() { + handle_res_parit!(std::fs::create_dir_all(par), "Failed to create parent dir to copy file {p}"); + } + + handle_res_parit!(copy_rl(self.old_root.join(&d.old_paths[0]), dest_path), "Failed to copy file {p}"); + None + }) + .collect() + }; if !checks.is_empty() { errs.lock().unwrap().extend(checks.drain(..)); return; } - inc_n(d.new_paths.len(), &bar_untouched); + inc_n(d.new_paths.len(), if d.idx == u64::MAX { &bar_untouched } else { &bar_new }); } }); } diff --git a/libfoldiff/src/common.rs b/libfoldiff/src/common.rs index a4c3655..0a575ec 100644 --- a/libfoldiff/src/common.rs +++ b/libfoldiff/src/common.rs @@ -1,5 +1,7 @@ use std::fs::File; use std::path::Path; +use anyhow::Context; +use crate::hash; pub const MAGIC_BYTES: [u8; 4] = *b"FLDF"; pub const VERSION_NUMBER_1_0_0_R: [u8; 4] = [1, 0, 0, b'r']; // v1.0.0-r @@ -22,6 +24,42 @@ pub fn create_file(p: &Path) -> std::io::Result { File::create(p) } +// Reflinks or copies a file and hashes it +pub fn copy_rl_hash(src_p: impl AsRef, dst_p: impl AsRef) -> anyhow::Result { + let src_p = src_p.as_ref(); + let dst_p = dst_p.as_ref(); + + // if we're on *nix, try reflinking + if cfg!(unix) && reflink::reflink(&src_p, &dst_p).is_ok() { + // reflinked, check the hash + hash::hash_file(&src_p).context(format!("Failed to hash file copied from {src_p:?}")) + } + else { + // reflink failed or we're on windows, copy + // copying in kernel space would be slightly faster but we have to check the hash + let mut src = File::open(&src_p).context(format!("Failed to open file to copy from {src_p:?}"))?; + let mut dst = create_file(&dst_p).context(format!("Failed to create file to copy to {dst_p:?}"))?; + + let mut hw = hash::XXHashStreamer::new(&mut dst); + std::io::copy(&mut src, &mut hw).context(format!("Failed to copy file {src_p:?}"))?; + + Ok(hw.finish()) + } +} + +pub fn copy_rl(src_p: impl AsRef, dst_p: impl AsRef) -> std::io::Result<()> { + let src_p = src_p.as_ref(); + let dst_p = dst_p.as_ref(); + + // if we're on *nix, try reflinking + if cfg!(unix) && reflink::reflink(&src_p, &dst_p).is_ok() { + Ok(()) + } + else { + std::fs::copy(src_p, dst_p).map(|_| ()) + } +} + /// If a vec is empty, do nothing. If it contains some errors, aggregate and return them. #[macro_export] macro_rules! aggregate_errors { diff --git a/libfoldiff/src/threading.rs b/libfoldiff/src/threading.rs index 08c3b38..ab2676b 100644 --- a/libfoldiff/src/threading.rs +++ b/libfoldiff/src/threading.rs @@ -23,6 +23,15 @@ macro_rules! handle_res_async { v.unwrap() } }}; + ($errs:expr, $res:expr) => {{ + let v = $res; + if let Err(e) = v { + throw_err_async!($errs, anyhow::anyhow!(format!("{e:?}"))); + } + else { + v.unwrap() + } + }}; } /// unwraps res and, if it's an error, returns Some(err)