Skip to content

Commit

Permalink
reflink duplicated files too
Browse files Browse the repository at this point in the history
  • Loading branch information
yellowsink committed Oct 2, 2024
1 parent 2806327 commit cd580a3
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 69 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG-foldiff.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
- replace `anyhow` with custom error types
- write custom threading utilities

## 1.3.1
- reflinks now apply for duplicated files too

## 1.3.0
- `foldiff upgrade` - upgrade older manifests to new ones
- move core `foldiff` functionality to `libfoldiff`
Expand Down
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion foldiff/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "foldiff"
authors = ["Hazel Atkinson"]
version = "1.3.0"
version = "1.3.1"
edition = "2021"
license-file = "../LICENSE.md"
description = "A general purpose diffing tool that operates on folders of mixed text/binary files."
Expand Down
4 changes: 2 additions & 2 deletions foldiff/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ mod cliutils;

#[derive(Parser, Debug)]
#[command(
version = "v1.3.0",
version = "v1.3.1",
about,
long_version = "v1.3.0
long_version = "v1.3.1
writing fldf v1.1.0
reading fldf 1.0.0-r, v1.1.0"
)]
Expand Down
2 changes: 1 addition & 1 deletion libfoldiff/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "libfoldiff"
authors = ["Hazel Atkinson"]
version = "1.3.0"
version = "1.3.1"
edition = "2021"
license-file = "../LICENSE.md"
description = "A general purpose diffing library for the FDLF format."
Expand Down
128 changes: 65 additions & 63 deletions libfoldiff/src/applying.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::common::create_file;
use crate::common::{copy_rl, copy_rl_hash, create_file};
use crate::manifest::DiffManifest;
use crate::reporting::{AutoSpin, CanBeWrappedBy, Reporter, ReporterSized, ReportingMultiWrapper};
use crate::{aggregate_errors, handle_res_parit, hash, zstddiff};
use crate::{aggregate_errors, handle_res_async, handle_res_parit, hash, throw_err_async, zstddiff};
use anyhow::{anyhow, Context};
use memmap2::Mmap;
use rayon::prelude::*;
Expand Down Expand Up @@ -32,7 +32,8 @@ impl ApplyingDiff {

let diff_map = &**self.read.as_ref().ok_or(anyhow!("Cannot call apply() on a state without a set `read` prop"))?;

let num_duped_files: u64 = self.manifest.duplicated_files.iter().map(|d| d.new_paths.len() as u64).sum();
let num_duped_copy: usize = self.manifest.duplicated_files.iter().filter(|d| d.idx == u64::MAX).map(|d| d.new_paths.len()).sum();
let num_duped_create: usize = self.manifest.duplicated_files.iter().filter(|d| d.idx != u64::MAX).map(|d| d.new_paths.len()).sum();

// incr bar and finish if done
let inc_n = |n: usize, b: &TBar| {
Expand All @@ -46,8 +47,8 @@ impl ApplyingDiff {
// progress reporting
let wrap = TWrap::new();
let spn = TSpin::new("Applying diff").add_to(&wrap);
let bar_untouched = <TBar as ReporterSized>::new("Copying unchanged files", self.manifest.untouched_files.len() + (num_duped_files as usize)).add_to(&wrap);
let bar_new = <TBar as ReporterSized>::new("Creating new files", self.manifest.new_files.len()).add_to(&wrap);
let bar_untouched = <TBar as ReporterSized>::new("Copying unchanged files", self.manifest.untouched_files.len() + num_duped_copy).add_to(&wrap);
let bar_new = <TBar as ReporterSized>::new("Creating new files", self.manifest.new_files.len() + num_duped_create).add_to(&wrap);
let bar_patched = <TBar as ReporterSized>::new("Applying patched files", self.manifest.patched_files.len()).add_to(&wrap);

let as1 = AutoSpin::spin(&spn);
Expand All @@ -74,23 +75,7 @@ impl ApplyingDiff {
let old_path = self.old_root.join(p);
let new_path = self.new_root.join(p);

let real_hash =
// if we're on *nix, try reflinking
if cfg!(unix) && reflink::reflink(&old_path, &new_path).is_ok() {
// reflinked, check the hash
handle_res_parit!(hash::hash_file(&old_path), "Failed to hash file copied from {}", p)
}
else {
// reflink failed or we're on windows, copy
// copying in kernel space would be slightly faster but we have to check the hash
let mut src = handle_res_parit!(File::open(&old_path), "Failed to open file to copy from {}", p);
let mut dst = handle_res_parit!(create_file(&new_path), "Failed to create file to copy to {}", p);

let mut hw = hash::XXHashStreamer::new(&mut dst);
handle_res_parit!(std::io::copy(&mut src, &mut hw), "Failed to copy file {}", p);

hw.finish()
};
let real_hash = handle_res_parit!(copy_rl_hash(old_path, new_path));

if real_hash != h {
return Some(anyhow!("Found {p} was different to expected (hash was {real_hash}, not {})", h));
Expand Down Expand Up @@ -131,56 +116,73 @@ impl ApplyingDiff {
}

// okay, now copy to all the new places then
let mut checks: Vec<_> = d.new_paths
.par_iter()
.filter_map(|p| {
// if we have a file on disk, then perform an in-kernel copy for speed
if d.idx == u64::MAX {
// ensure we have a parent directory
let dest_path = self.new_root.join(p);
if let Some(par) = dest_path.parent() {
handle_res_parit!(std::fs::create_dir_all(par), "Failed to create parent dir to copy file {p}");
}

handle_res_parit!(std::fs::copy(self.old_root.join(&d.old_paths[0]), dest_path), "Failed to copy file {p}");
// if we have a file on disk, then perform an in-kernel copy for speed
let mut checks: Vec<_> =
if d.idx == u64::MAX {
d.new_paths
.par_iter()
.filter_map(|p| {
// ensure we have a parent directory
let dest_path = self.new_root.join(p);
if let Some(par) = dest_path.parent() {
handle_res_parit!(std::fs::create_dir_all(par), "Failed to create parent dir to copy file {p}");
}

handle_res_parit!(copy_rl(self.old_root.join(&d.old_paths[0]), dest_path), "Failed to copy file {p}");
None
})
.collect()
}
else {
// we need to copy out of ourself
let blob = if let Some(t) = self.blobs_new.get(d.idx as usize) {
*t as usize
}
else {
// we need to copy out of ourself
let blob = if let Some(t) = self.blobs_new.get(d.idx as usize) {
*t as usize
}
else {
return Some(anyhow!("new file {} had an out-of-range index pointing to its data", p));
};

// read length
let len = u64::from_be_bytes(*diff_map[blob..].first_chunk().unwrap()) as usize;
let blob = blob + 8; // advance past length

// TODO: reflink
// copy
let mut read = Cursor::new(&diff_map[blob..(blob + len)]);
let f = handle_res_parit!(create_file(&self.new_root.join(p)), "Failed to create new file {p} to write to");
let mut writer = hash::XXHashStreamer::new(f);

handle_res_parit!(std::io::copy(&mut read, &mut writer));

// check hash
let rh = writer.finish();
if rh != d.hash {
return Some(anyhow!("Newly created file {p} does not match expected data"))
}
throw_err_async!(errs, anyhow!("new file {} had an out-of-range index pointing to its data", d.new_paths[0]));
};

// read length
let len = u64::from_be_bytes(*diff_map[blob..].first_chunk().unwrap()) as usize;
let blob = blob + 8; // advance past length

// copy one out
let p = &d.new_paths[0];
let mut read = Cursor::new(&diff_map[blob..(blob + len)]);
let f = handle_res_async!(errs, create_file(&self.new_root.join(p)), "Failed to create new file {p} to write to");
let mut writer = hash::XXHashStreamer::new(f);

handle_res_async!(errs, std::io::copy(&mut read, &mut writer));

// check hash
let rh = writer.finish();
if rh != d.hash {
throw_err_async!(errs, anyhow!("Newly created file {p} does not match expected data"));
}
None
})
.collect();

// copy to the rest
d.new_paths
.par_iter()
.skip(1)
.filter_map(|p| {
// ensure we have a parent directory
let dest_path = self.new_root.join(p);
if let Some(par) = dest_path.parent() {
handle_res_parit!(std::fs::create_dir_all(par), "Failed to create parent dir to copy file {p}");
}

handle_res_parit!(copy_rl(self.old_root.join(&d.old_paths[0]), dest_path), "Failed to copy file {p}");
None
})
.collect()
};

if !checks.is_empty() {
errs.lock().unwrap().extend(checks.drain(..));
return;
}

inc_n(d.new_paths.len(), &bar_untouched);
inc_n(d.new_paths.len(), if d.idx == u64::MAX { &bar_untouched } else { &bar_new });
}
});
}
Expand Down
38 changes: 38 additions & 0 deletions libfoldiff/src/common.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::fs::File;
use std::path::Path;
use anyhow::Context;
use crate::hash;

pub const MAGIC_BYTES: [u8; 4] = *b"FLDF";
pub const VERSION_NUMBER_1_0_0_R: [u8; 4] = [1, 0, 0, b'r']; // v1.0.0-r
Expand All @@ -22,6 +24,42 @@ pub fn create_file(p: &Path) -> std::io::Result<File> {
File::create(p)
}

// Reflinks or copies a file and hashes it
pub fn copy_rl_hash(src_p: impl AsRef<Path>, dst_p: impl AsRef<Path>) -> anyhow::Result<u64> {
let src_p = src_p.as_ref();
let dst_p = dst_p.as_ref();

// if we're on *nix, try reflinking
if cfg!(unix) && reflink::reflink(&src_p, &dst_p).is_ok() {
// reflinked, check the hash
hash::hash_file(&src_p).context(format!("Failed to hash file copied from {src_p:?}"))
}
else {
// reflink failed or we're on windows, copy
// copying in kernel space would be slightly faster but we have to check the hash
let mut src = File::open(&src_p).context(format!("Failed to open file to copy from {src_p:?}"))?;
let mut dst = create_file(&dst_p).context(format!("Failed to create file to copy to {dst_p:?}"))?;

let mut hw = hash::XXHashStreamer::new(&mut dst);
std::io::copy(&mut src, &mut hw).context(format!("Failed to copy file {src_p:?}"))?;

Ok(hw.finish())
}
}

/// Copies `src_p` to `dst_p`, attempting a copy-on-write reflink first
/// (on *nix) and falling back to a regular filesystem copy otherwise.
///
/// # Errors
/// Returns any I/O error produced by the fallback copy.
pub fn copy_rl(src_p: impl AsRef<Path>, dst_p: impl AsRef<Path>) -> std::io::Result<()> {
    let (src, dst) = (src_p.as_ref(), dst_p.as_ref());

    // a successful reflink on *nix means there is nothing more to do
    if cfg!(unix) && reflink::reflink(src, dst).is_ok() {
        return Ok(());
    }

    // reflink unavailable or failed — plain copy, discarding the byte count
    std::fs::copy(src, dst).map(|_| ())
}

/// If a vec is empty, do nothing. If it contains some errors, aggregate and return them.
#[macro_export]
macro_rules! aggregate_errors {
Expand Down
9 changes: 9 additions & 0 deletions libfoldiff/src/threading.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ macro_rules! handle_res_async {
v.unwrap()
}
}};
($errs:expr, $res:expr) => {{
let v = $res;
if let Err(e) = v {
throw_err_async!($errs, anyhow::anyhow!(format!("{e:?}")));
}
else {
v.unwrap()
}
}};
}

/// unwraps res and, if it's an error, returns Some(err)
Expand Down

0 comments on commit cd580a3

Please sign in to comment.