-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add read support for v2016, v2020 chunkers
In addition to the FastCDC types in the v2016 and v2020 modules, there are now StreamCDC structs that read from a boxed Read into a buffer sized to fit the maximum chunk. While this is convenient for processing large files, it is a bit slower than using memory-mapped files with a crate such as memmap2. Added examples that demonstrate using the streaming chunkers. All tests pass under cargo test.
- Loading branch information
Showing
12 changed files
with
979 additions
and
214 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,11 @@ | ||
* Action Items | ||
** TODO Rewrite | ||
*** TODO incorporate some form of streaming support based on =Read= | ||
**** c.f. https://gitlab.com/asuran-rs/asuran/ (asuran-chunker, uses =fastcdc= with =Read=) | ||
**** basically just allocate a buffer 2*max and fill it as needed | ||
**** c.f. https://github.com/jotfs/fastcdc-go/blob/master/fastcdc.go | ||
**** c.f. https://github.com/wxiacode/Duplicacy-FastCDC/blob/master/src/duplicacy_chunkmaker.go | ||
*** TODO test: check if ronomon version of fastcdc produces same results as rust version | ||
**** if so, maybe make this a requirement of the rust version of ronomon | ||
** timing on =MSEdge-Win10.ova= with 4mb chunks | ||
*** run with =--release= flag 7 times, drop low/high, average remaining 5 | ||
| chunker | avg time | | ||
|---------+----------| | ||
| v2020 | 3.437 | | ||
| ronomon | 4.085 | | ||
| v2016 | 4.266 | | ||
** Time for examples to chunk =MSEdge-Win10.ova= with 4mb chunks | ||
*** use =time cargo run --release ...=, 7 times, drop low/high, average remaining 5 | ||
*** note that the non-streaming examples use =memmap2= to read from the file as a slice | ||
| chunker | avg time | | ||
|------------+----------| | ||
| v2020 | 3.437 | | ||
| ronomon | 4.085 | | ||
| v2016 | 4.266 | | ||
| stream2020 | 5.847 | | ||
| stream2016 | 6.659 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// | ||
// Copyright (c) 2023 Nathan Fiedler | ||
// | ||
use clap::{arg, command, value_parser, Arg}; | ||
use fastcdc::v2016::*; | ||
use std::fs::File; | ||
|
||
fn main() { | ||
let matches = command!("Example of using v2016 streaming chunker.") | ||
.about("Finds the content-defined chunk boundaries of a file.") | ||
.arg( | ||
arg!( | ||
-s --size <SIZE> "The desired average size of the chunks." | ||
) | ||
.value_parser(value_parser!(u32)), | ||
) | ||
.arg( | ||
Arg::new("INPUT") | ||
.help("Sets the input file to use") | ||
.required(true) | ||
.index(1), | ||
) | ||
.get_matches(); | ||
let size = matches.get_one::<u32>("size").unwrap_or(&131072); | ||
let avg_size = *size; | ||
let filename = matches.get_one::<String>("INPUT").unwrap(); | ||
let file = File::open(filename).expect("cannot open file!"); | ||
let min_size = avg_size / 4; | ||
let max_size = avg_size * 4; | ||
let chunker = StreamCDC::new(Box::new(file), min_size, avg_size, max_size); | ||
for result in chunker { | ||
let entry = result.expect("failed to read chunk"); | ||
println!( | ||
"hash={} offset={} size={}", | ||
entry.hash, entry.offset, entry.length | ||
); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// | ||
// Copyright (c) 2023 Nathan Fiedler | ||
// | ||
use clap::{arg, command, value_parser, Arg}; | ||
use fastcdc::v2020::*; | ||
use std::fs::File; | ||
|
||
fn main() { | ||
let matches = command!("Example of using v2020 streaming chunker.") | ||
.about("Finds the content-defined chunk boundaries of a file.") | ||
.arg( | ||
arg!( | ||
-s --size <SIZE> "The desired average size of the chunks." | ||
) | ||
.value_parser(value_parser!(u32)), | ||
) | ||
.arg( | ||
Arg::new("INPUT") | ||
.help("Sets the input file to use") | ||
.required(true) | ||
.index(1), | ||
) | ||
.get_matches(); | ||
let size = matches.get_one::<u32>("size").unwrap_or(&131072); | ||
let avg_size = *size; | ||
let filename = matches.get_one::<String>("INPUT").unwrap(); | ||
let file = File::open(filename).expect("cannot open file!"); | ||
let min_size = avg_size / 4; | ||
let max_size = avg_size * 4; | ||
let chunker = StreamCDC::new(Box::new(file), min_size, avg_size, max_size); | ||
for result in chunker { | ||
let entry = result.expect("failed to read chunk"); | ||
println!( | ||
"hash={} offset={} size={}", | ||
entry.hash, entry.offset, entry.length | ||
); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.