From deec9c3f6b69d5d410e0d826fc09c162929fcd42 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 16:08:11 +1000 Subject: [PATCH 01/17] simplify --- src/lib.rs | 173 ++++++++++++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 81 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bfe463c..b965390 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,7 @@ use std::time::Instant; use typed_arena::Arena; use walkdir::DirEntry; use walkdir::WalkDir; - +use std::collections::HashSet; use dashmap::DashMap; struct HashableDirEntry(DirEntry); @@ -72,22 +72,13 @@ pub struct Opt { } impl Opt { - pub fn from_paths(paths: Vec) -> Opt { - Opt { - paths, - timing: false, - debug: false, - output: None, - minimum: None, - sort: false, - } - } + } type BoxResult = Result>; // given a path, returns the filesize of the file at that path -fn byte_count_file(path: &HashableDirEntry) -> BoxResult { +fn byte_count_file(path: &DirEntry) -> BoxResult { let metadata = path.metadata()?; Ok(metadata.len()) } @@ -95,7 +86,7 @@ fn byte_count_file(path: &HashableDirEntry) -> BoxResult { use seahash::SeaHasher; // given a path, returns a hash of all the bytes of the file at that path -fn hash_file(path: &HashableDirEntry) -> BoxResult { +fn hash_file(path: &DirEntry) -> BoxResult { let file = File::open(path.path())?; let mut hasher = SeaHasher::new(); let mut reader = BufReader::new(file); @@ -113,7 +104,7 @@ fn hash_file(path: &HashableDirEntry) -> BoxResult { Ok(hasher.finish()) } -// given a path, returns a hash of all the bytes of the file at that path +// given a path, returns a hash of the first few bytes of the file at that path fn hash_start_file(path: &HashableDirEntry) -> BoxResult { let file = File::open(path.path())?; let mut hasher = SeaHasher::new(); @@ -151,8 +142,7 @@ fn print_timing_info(now: Instant) { fn walk_dirs( input: Vec, - arena: &Arena, -) -> DashMap<&HashableDirEntry, ()> { +) -> Vec { let vec: Vec = input .par_iter() 
.map(|path| { @@ -164,34 +154,35 @@ fn walk_dirs( }) .flatten() .collect(); - let paths = DashMap::new(); + let mut paths = Vec::new(); for entry in vec { - let item = arena.alloc(HashableDirEntry(entry)); - paths.insert(&*item, ()); + let item = CandidateFile { + path: entry, + size: None, + start_hash: None, + full_hash: None, + }; + paths.push(item); } paths } + fn cull_by_filesize( - input: DashMap<&HashableDirEntry, ()>, + input: Vec, minimum: u64, -) -> DashMap<&HashableDirEntry, u64> { - let dupes = DashMap::new(); - let file_hashes = DashMap::new(); - input - .into_iter() - .par_bridge() - .for_each(|(current_path, _)| { - if let Ok(bytes_count) = byte_count_file(current_path) { - if bytes_count >= minimum { - if let Some(path) = file_hashes.insert(bytes_count, current_path) { - dupes.insert(current_path, bytes_count); - dupes.insert(path, bytes_count); - } - } +) -> Vec { + let mut out = Vec::new(); + for mut candidate in input { + let current_path = &candidate.path; + if let Ok(bytes_count) = byte_count_file(¤t_path) { + if bytes_count >= minimum { + candidate.size = Some(bytes_count); + out.push(candidate) } - }); - dupes + } + } + out } fn cull_by_start(input: DashMap<&HashableDirEntry, u64>) -> DashMap<&HashableDirEntry, u64> { @@ -214,33 +205,46 @@ fn cull_by_start(input: DashMap<&HashableDirEntry, u64>) -> DashMap<&HashableDir } fn cull_by_hash( - input: DashMap<&HashableDirEntry, u64>, -) -> Vec<(&HashableDirEntry, &HashableDirEntry, u64)> { - let file_hashes = DashMap::new(); - input - .into_iter() - .par_bridge() - .filter_map(|(current_path, bytes_count)| { - if let Ok(hash) = hash_file(current_path) { - if let Some(path) = file_hashes.insert(hash, current_path) { - return Some((current_path, path, bytes_count)); - } + mut input: Vec, +) -> Vec { + + for candidate in input.iter_mut() { + let current_path = &candidate.path; + if let Ok(hash) = hash_file(current_path) { + candidate.full_hash = Some(hash); + } + } + + let mut hashes = 
HashSet::new(); + let mut dupe_hashes = Vec::new(); + for candidate in &input { + if let Some(hash) = candidate.full_hash { + if hashes.contains(&hash){ + dupe_hashes.push(hash) + } else { + hashes.insert(hash); } - None - }) - .collect::>() + } + } + + let mut out = Vec::new(); + for candidate in input { + if let Some(hash) = candidate.full_hash{ + if dupe_hashes.contains(&hash){ + out.push(candidate) + } + } + } + out } -fn format_results(input: &[(&HashableDirEntry, &HashableDirEntry, u64)]) -> Vec { +fn format_results(input: Vec) -> Vec { input .par_iter() .map(|item| { - let (dupe1, dupe2, bytes_count) = item; format!( - "{}: {} | {} \n", - bytes_count, - dupe1.path().display(), - dupe2.path().display() + "{}: \n", + item.path.path().display() ) }) .collect::>() @@ -252,17 +256,24 @@ fn maybe_send_progress<'a>(progress: &Option>, message: &'a str) } } +struct CandidateFile { + path: DirEntry, + size: Option, + start_hash: Option, + full_hash: Option, +} + + pub fn detect_dupes(options: Opt, progress: Option>) -> Vec { let now = Instant::now(); maybe_send_progress(&progress, "Walking dirs"); - let arena = Arena::new(); - let paths = walk_dirs(options.paths, &arena); + let paths = walk_dirs(options.paths); if options.debug { println!("{} files found ", paths.len()); } - let minimum = options.minimum.unwrap_or(0); + let minimum = options.minimum.unwrap_or(1); maybe_send_progress(&progress, "Culling by filesizes"); let paths = cull_by_filesize(paths, minimum); @@ -271,31 +282,31 @@ pub fn detect_dupes(options: Opt, progress: Option>) -> Vec println!("{} potential dupes after filesize cull", paths.len()); } - maybe_send_progress(&progress, "Culling by start"); - let paths = cull_by_start(paths); + // maybe_send_progress(&progress, "Culling by start"); + // let paths = cull_by_start(paths); - if options.debug { - println!("{} potential dupes after start cull", paths.len()); - } + // if options.debug { + // println!("{} potential dupes after start cull", 
paths.len()); + // } maybe_send_progress(&progress, "Culling by hash"); - let mut confirmed_dupes = cull_by_hash(paths); - - if options.debug { - println!("{} dupes after full file hashing", confirmed_dupes.len()); - } - if options.sort { - confirmed_dupes.sort_by_cached_key(|confirmed_dup| confirmed_dup.2); - } + let paths = cull_by_hash(paths); + + // if options.debug { + // println!("{} dupes after full file hashing", confirmed_dupes.len()); + // } + // if options.sort { + // confirmed_dupes.sort_by_cached_key(|confirmed_dup| confirmed_dup.2); + // } maybe_send_progress(&progress, "Formatting"); - let output_strings = format_results(&confirmed_dupes); - - if let Some(path) = options.output { - let mut f = File::create(path).unwrap(); - f.write_all(output_strings.join("").as_bytes()).unwrap(); - } - if options.timing { - print_timing_info(now); - } + let output_strings = format_results(paths); + + // if let Some(path) = options.output { + // let mut f = File::create(path).unwrap(); + // f.write_all(output_strings.join("").as_bytes()).unwrap(); + // } + // if options.timing { + // print_timing_info(now); + // } output_strings } From 9f9b467e223def06362e6916436a97abd6a38bcc Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 16:18:36 +1000 Subject: [PATCH 02/17] simplify --- src/lib.rs | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b965390..a02feca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -105,7 +105,7 @@ fn hash_file(path: &DirEntry) -> BoxResult { } // given a path, returns a hash of the first few bytes of the file at that path -fn hash_start_file(path: &HashableDirEntry) -> BoxResult { +fn hash_start_file(path: &DirEntry) -> BoxResult { let file = File::open(path.path())?; let mut hasher = SeaHasher::new(); let mut reader = BufReader::new(file); @@ -185,23 +185,38 @@ fn cull_by_filesize( out } -fn cull_by_start(input: DashMap<&HashableDirEntry, u64>) 
-> DashMap<&HashableDirEntry, u64> { - let dupes = DashMap::new(); - let file_hashes = DashMap::new(); - input - .into_iter() - .par_bridge() - .for_each(|(current_path, size)| { - if size < 640_000 { - dupes.insert(current_path, size); - } else if let Ok(hash) = hash_start_file(current_path) { - if let Some(path) = file_hashes.insert(hash, current_path) { - dupes.insert(current_path, size); - dupes.insert(path, size); - } +fn cull_by_start( + mut input: Vec, +) -> Vec { + + for candidate in input.iter_mut() { + let current_path = &candidate.path; + if let Ok(hash) = hash_start_file(current_path) { + candidate.start_hash = Some(hash); + } + } + + let mut hashes = HashSet::new(); + let mut dupe_hashes = Vec::new(); + for candidate in &input { + if let Some(hash) = candidate.start_hash { + if hashes.contains(&hash){ + dupe_hashes.push(hash) + } else { + hashes.insert(hash); } - }); - dupes + } + } + + let mut out = Vec::new(); + for candidate in input { + if let Some(hash) = candidate.start_hash{ + if dupe_hashes.contains(&hash){ + out.push(candidate) + } + } + } + out } fn cull_by_hash( @@ -283,7 +298,7 @@ pub fn detect_dupes(options: Opt, progress: Option>) -> Vec } // maybe_send_progress(&progress, "Culling by start"); - // let paths = cull_by_start(paths); + let paths = cull_by_start(paths); // if options.debug { // println!("{} potential dupes after start cull", paths.len()); From f78b1a5c460892838c22696ec0f8d2aee15ac37d Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 16:36:36 +1000 Subject: [PATCH 03/17] more cleanup --- Cargo.lock | 73 +----------------------------------------------------- Cargo.toml | 2 -- src/lib.rs | 65 +++++++++++++++++++----------------------------- 3 files changed, 26 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35d0ad6..99d247f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,12 +71,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.4.0" @@ -262,19 +256,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "dashmap" -version = "5.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" -dependencies = [ - "cfg-if", - "hashbrown", - "lock_api", - "once_cell", - "parking_lot_core", -] - [[package]] name = "dedupe" version = "2.2.1" @@ -282,10 +263,8 @@ dependencies = [ "clap", "criterion", "crossbeam-channel", - "dashmap", "rayon", "seahash", - "typed-arena", "walkdir", ] @@ -322,12 +301,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hashbrown" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" - [[package]] name = "heck" version = "0.4.1" @@ -387,16 +360,6 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" -[[package]] -name = "lock_api" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.20" @@ -449,19 +412,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "parking_lot_core" -version = "0.9.8" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets", -] - [[package]] name = "plotters" version = "0.3.5" @@ -530,15 +480,6 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "regex" version = "1.9.5" @@ -574,7 +515,7 @@ version = "0.38.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0c3dde1fc030af041adc40e79c0e7fbcf431dd24870053d187d7c66e4b87453" dependencies = [ - "bitflags 2.4.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -639,12 +580,6 @@ dependencies = [ "serde", ] -[[package]] -name = "smallvec" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" - [[package]] name = "strsim" version = "0.10.0" @@ -672,12 +607,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "typed-arena" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" - [[package]] name = "unicode-ident" version = "1.0.11" diff --git a/Cargo.toml b/Cargo.toml index 7c070a1..3d5c379 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,9 +13,7 @@ lto = "thin" walkdir = "2" rayon = "1" crossbeam-channel = "0" -typed-arena = "2" seahash = "4" -dashmap = "5" clap = { version = "4.4.2", features = ["derive"] } [lib] diff --git a/src/lib.rs b/src/lib.rs index a02feca..3f7aa97 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,25 +1,22 @@ extern crate clap; -extern crate crossbeam_channel; extern crate rayon; extern crate walkdir; 
-use clap::Parser; use crossbeam_channel::Sender; +use clap::Parser; use rayon::prelude::*; use std::error::Error; use std::fs::File; use std::hash::Hasher; use std::io::BufReader; use std::io::Read; -use std::io::Write; use std::ops::Deref; use std::ops::DerefMut; use std::path::PathBuf; use std::time::Instant; -use typed_arena::Arena; use walkdir::DirEntry; use walkdir::WalkDir; use std::collections::HashSet; -use dashmap::DashMap; + struct HashableDirEntry(DirEntry); @@ -63,16 +60,8 @@ pub struct Opt { #[arg(short, long)] pub debug: bool, - #[arg(short, long)] - pub output: Option, #[arg(short, long)] pub minimum: Option, - #[arg(short, long)] - pub sort: bool, -} - -impl Opt { - } type BoxResult = Result>; @@ -85,7 +74,7 @@ fn byte_count_file(path: &DirEntry) -> BoxResult { use seahash::SeaHasher; -// given a path, returns a hash of all the bytes of the file at that path +// given a path, returns a hash of the bytes of the file at that path fn hash_file(path: &DirEntry) -> BoxResult { let file = File::open(path.path())?; let mut hasher = SeaHasher::new(); @@ -104,7 +93,7 @@ fn hash_file(path: &DirEntry) -> BoxResult { Ok(hasher.finish()) } -// given a path, returns a hash of the first few bytes of the file at that path +// given a path, returns a hash of the first 64k bytes of the file at that path fn hash_start_file(path: &DirEntry) -> BoxResult { let file = File::open(path.path())?; let mut hasher = SeaHasher::new(); @@ -186,15 +175,16 @@ fn cull_by_filesize( } fn cull_by_start( - mut input: Vec, + input: Vec, ) -> Vec { - for candidate in input.iter_mut() { + let input: Vec<_> = input.par_iter().cloned().map(|mut candidate| { let current_path = &candidate.path; if let Ok(hash) = hash_start_file(current_path) { candidate.start_hash = Some(hash); } - } + candidate + }).collect(); let mut hashes = HashSet::new(); let mut dupe_hashes = Vec::new(); @@ -220,15 +210,16 @@ fn cull_by_start( } fn cull_by_hash( - mut input: Vec, + input: Vec, ) -> Vec { - for 
candidate in input.iter_mut() { + let input: Vec<_> = input.par_iter().cloned().map(|mut candidate| { let current_path = &candidate.path; if let Ok(hash) = hash_file(current_path) { - candidate.full_hash = Some(hash); + candidate.start_hash = Some(hash); } - } + candidate + }).collect(); let mut hashes = HashSet::new(); let mut dupe_hashes = Vec::new(); @@ -270,7 +261,7 @@ fn maybe_send_progress<'a>(progress: &Option>, message: &'a str) p.send(message).unwrap(); } } - +#[derive(Clone)] struct CandidateFile { path: DirEntry, size: Option, @@ -297,31 +288,25 @@ pub fn detect_dupes(options: Opt, progress: Option>) -> Vec println!("{} potential dupes after filesize cull", paths.len()); } - // maybe_send_progress(&progress, "Culling by start"); + maybe_send_progress(&progress, "Culling by start"); let paths = cull_by_start(paths); - // if options.debug { - // println!("{} potential dupes after start cull", paths.len()); - // } + if options.debug { + println!("{} potential dupes after start cull", paths.len()); + } maybe_send_progress(&progress, "Culling by hash"); let paths = cull_by_hash(paths); - // if options.debug { - // println!("{} dupes after full file hashing", confirmed_dupes.len()); - // } - // if options.sort { - // confirmed_dupes.sort_by_cached_key(|confirmed_dup| confirmed_dup.2); - // } + if options.debug { + println!("{} dupes after full file hashing", paths.len()); + } + maybe_send_progress(&progress, "Formatting"); let output_strings = format_results(paths); - // if let Some(path) = options.output { - // let mut f = File::create(path).unwrap(); - // f.write_all(output_strings.join("").as_bytes()).unwrap(); - // } - // if options.timing { - // print_timing_info(now); - // } + if options.timing { + print_timing_info(now); + } output_strings } From 14a90ee1f1ef35b13f8fc88f498d9b6bdfc822b6 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 16:42:28 +1000 Subject: [PATCH 04/17] more cleanup --- src/lib.rs | 108 
++++++++++++++++++++++++----------------------------- 1 file changed, 49 insertions(+), 59 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 3f7aa97..f689101 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,10 @@ extern crate clap; extern crate rayon; extern crate walkdir; -use crossbeam_channel::Sender; use clap::Parser; +use crossbeam_channel::Sender; use rayon::prelude::*; +use std::collections::HashSet; use std::error::Error; use std::fs::File; use std::hash::Hasher; @@ -15,8 +16,6 @@ use std::path::PathBuf; use std::time::Instant; use walkdir::DirEntry; use walkdir::WalkDir; -use std::collections::HashSet; - struct HashableDirEntry(DirEntry); @@ -129,9 +128,7 @@ fn print_timing_info(now: Instant) { ); } -fn walk_dirs( - input: Vec, -) -> Vec { +fn walk_dirs(input: Vec) -> Vec { let vec: Vec = input .par_iter() .map(|path| { @@ -148,19 +145,15 @@ fn walk_dirs( let item = CandidateFile { path: entry, size: None, - start_hash: None, - full_hash: None, + start_hash: None, + full_hash: None, }; paths.push(item); } paths } - -fn cull_by_filesize( - input: Vec, - minimum: u64, -) -> Vec { +fn cull_by_filesize(input: Vec, minimum: u64) -> Vec { let mut out = Vec::new(); for mut candidate in input { let current_path = &candidate.path; @@ -174,24 +167,25 @@ fn cull_by_filesize( out } -fn cull_by_start( - input: Vec, -) -> Vec { - - let input: Vec<_> = input.par_iter().cloned().map(|mut candidate| { - let current_path = &candidate.path; - if let Ok(hash) = hash_start_file(current_path) { - candidate.start_hash = Some(hash); - } - candidate - }).collect(); +fn cull_by_start(input: Vec) -> Vec { + let input: Vec<_> = input + .par_iter() + .cloned() + .map(|mut candidate| { + let current_path = &candidate.path; + if let Ok(hash) = hash_start_file(current_path) { + candidate.start_hash = Some(hash); + } + candidate + }) + .collect(); let mut hashes = HashSet::new(); - let mut dupe_hashes = Vec::new(); + let mut dupe_hashes = HashSet::new(); for candidate in &input { 
if let Some(hash) = candidate.start_hash { - if hashes.contains(&hash){ - dupe_hashes.push(hash) + if hashes.contains(&hash) { + dupe_hashes.insert(hash); } else { hashes.insert(hash); } @@ -200,8 +194,8 @@ fn cull_by_start( let mut out = Vec::new(); for candidate in input { - if let Some(hash) = candidate.start_hash{ - if dupe_hashes.contains(&hash){ + if let Some(hash) = candidate.start_hash { + if dupe_hashes.contains(&hash) { out.push(candidate) } } @@ -209,24 +203,25 @@ fn cull_by_start( out } -fn cull_by_hash( - input: Vec, -) -> Vec { - - let input: Vec<_> = input.par_iter().cloned().map(|mut candidate| { - let current_path = &candidate.path; - if let Ok(hash) = hash_file(current_path) { - candidate.start_hash = Some(hash); - } - candidate - }).collect(); +fn cull_by_hash(input: Vec) -> Vec { + let input: Vec<_> = input + .par_iter() + .cloned() + .map(|mut candidate| { + let current_path = &candidate.path; + if let Ok(hash) = hash_file(current_path) { + candidate.start_hash = Some(hash); + } + candidate + }) + .collect(); let mut hashes = HashSet::new(); - let mut dupe_hashes = Vec::new(); + let mut dupe_hashes = HashSet::new(); for candidate in &input { if let Some(hash) = candidate.full_hash { - if hashes.contains(&hash){ - dupe_hashes.push(hash) + if hashes.contains(&hash) { + dupe_hashes.insert(hash); } else { hashes.insert(hash); } @@ -235,8 +230,8 @@ fn cull_by_hash( let mut out = Vec::new(); for candidate in input { - if let Some(hash) = candidate.full_hash{ - if dupe_hashes.contains(&hash){ + if let Some(hash) = candidate.full_hash { + if dupe_hashes.contains(&hash) { out.push(candidate) } } @@ -245,15 +240,11 @@ fn cull_by_hash( } fn format_results(input: Vec) -> Vec { - input - .par_iter() - .map(|item| { - format!( - "{}: \n", - item.path.path().display() - ) - }) - .collect::>() + let mut out = Vec::new(); + for item in input { + out.push(format!("{}: \n", item.path.path().display())) + } + out } fn maybe_send_progress<'a>(progress: &Option>, 
message: &'a str) { @@ -263,13 +254,12 @@ fn maybe_send_progress<'a>(progress: &Option>, message: &'a str) } #[derive(Clone)] struct CandidateFile { - path: DirEntry, - size: Option, - start_hash: Option, - full_hash: Option, + path: DirEntry, + size: Option, + start_hash: Option, + full_hash: Option, } - pub fn detect_dupes(options: Opt, progress: Option>) -> Vec { let now = Instant::now(); maybe_send_progress(&progress, "Walking dirs"); From 09eadf318d3ea2e27fed4cd42abd04fa58a662f7 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 16:43:42 +1000 Subject: [PATCH 05/17] more cleanup --- src/lib.rs | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f689101..e0d20ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,44 +10,11 @@ use std::fs::File; use std::hash::Hasher; use std::io::BufReader; use std::io::Read; -use std::ops::Deref; -use std::ops::DerefMut; use std::path::PathBuf; use std::time::Instant; use walkdir::DirEntry; use walkdir::WalkDir; - -struct HashableDirEntry(DirEntry); - -impl Deref for HashableDirEntry { - fn deref(&self) -> &Self::Target { - &self.0 - } - type Target = DirEntry; -} - -impl DerefMut for HashableDirEntry { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl std::cmp::Eq for HashableDirEntry {} - -impl std::cmp::PartialEq for HashableDirEntry { - fn eq(&self, rhs: &HashableDirEntry) -> bool { - self.path() == rhs.path() - } -} - -impl std::hash::Hash for HashableDirEntry { - fn hash(&self, h: &mut H) - where - H: std::hash::Hasher, - { - self.path().hash(h); - } -} +use seahash::SeaHasher; #[derive(Parser, Debug, Default)] pub struct Opt { @@ -71,8 +38,6 @@ fn byte_count_file(path: &DirEntry) -> BoxResult { Ok(metadata.len()) } -use seahash::SeaHasher; - // given a path, returns a hash of the bytes of the file at that path fn hash_file(path: &DirEntry) -> BoxResult { let file = File::open(path.path())?; From 
f84837e0f1c399ab6c1b61acffff49e6633a7dad Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 16:53:21 +1000 Subject: [PATCH 06/17] more fixups --- Cargo.lock | 3 +-- Cargo.toml | 4 +--- src/bin/cli.rs | 21 +-------------------- src/lib.rs | 13 +------------ 4 files changed, 4 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 99d247f..a410998 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,11 +258,10 @@ dependencies = [ [[package]] name = "dedupe" -version = "2.2.1" +version = "3.2.1" dependencies = [ "clap", "criterion", - "crossbeam-channel", "rayon", "seahash", "walkdir", diff --git a/Cargo.toml b/Cargo.toml index 3d5c379..fa1491a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dedupe" -version = "2.2.1" +version = "3.2.1" authors = ["Justin Lambert "] publish = false edition = "2021" @@ -12,7 +12,6 @@ lto = "thin" [dependencies] walkdir = "2" rayon = "1" -crossbeam-channel = "0" seahash = "4" clap = { version = "4.4.2", features = ["derive"] } @@ -20,7 +19,6 @@ clap = { version = "4.4.2", features = ["derive"] } name = "dupelib" path = "src/lib.rs" - [dev-dependencies] criterion = "0" diff --git a/src/bin/cli.rs b/src/bin/cli.rs index c7dc1dc..83721e2 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -1,32 +1,13 @@ extern crate clap; -extern crate crossbeam_channel; extern crate dupelib; use clap::Parser; -use crossbeam_channel::unbounded; -use crossbeam_channel::Receiver; -use crossbeam_channel::RecvError; -use crossbeam_channel::Sender; use dupelib::detect_dupes; use dupelib::Opt; -use std::thread; - fn run_dupe_detect(options: Opt) { - let (sender, receiver): (Sender<&str>, Receiver<&str>) = unbounded(); - thread::spawn(move || { - let mut cont = true; - while cont { - cont = false; - let data = receiver.recv(); - if data != Err(RecvError) { - cont = true; - println!("{}", data.unwrap()); - } - } - }); - let dupes = detect_dupes(options, Some(sender)); + let dupes = detect_dupes(options); 
println!("{:?}", dupes); } diff --git a/src/lib.rs b/src/lib.rs index e0d20ed..745e1e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,6 @@ extern crate clap; extern crate rayon; extern crate walkdir; use clap::Parser; -use crossbeam_channel::Sender; use rayon::prelude::*; use std::collections::HashSet; use std::error::Error; @@ -212,11 +211,6 @@ fn format_results(input: Vec) -> Vec { out } -fn maybe_send_progress<'a>(progress: &Option>, message: &'a str) { - if let Some(p) = progress { - p.send(message).unwrap(); - } -} #[derive(Clone)] struct CandidateFile { path: DirEntry, @@ -225,9 +219,8 @@ struct CandidateFile { full_hash: Option, } -pub fn detect_dupes(options: Opt, progress: Option>) -> Vec { +pub fn detect_dupes(options: Opt) -> Vec { let now = Instant::now(); - maybe_send_progress(&progress, "Walking dirs"); let paths = walk_dirs(options.paths); if options.debug { @@ -236,28 +229,24 @@ pub fn detect_dupes(options: Opt, progress: Option>) -> Vec let minimum = options.minimum.unwrap_or(1); - maybe_send_progress(&progress, "Culling by filesizes"); let paths = cull_by_filesize(paths, minimum); if options.debug { println!("{} potential dupes after filesize cull", paths.len()); } - maybe_send_progress(&progress, "Culling by start"); let paths = cull_by_start(paths); if options.debug { println!("{} potential dupes after start cull", paths.len()); } - maybe_send_progress(&progress, "Culling by hash"); let paths = cull_by_hash(paths); if options.debug { println!("{} dupes after full file hashing", paths.len()); } - maybe_send_progress(&progress, "Formatting"); let output_strings = format_results(paths); if options.timing { From 80e5d2ce965f02173f4530f8ccf9b14f5a821771 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 18:01:02 +1000 Subject: [PATCH 07/17] more cleanup --- Cargo.toml | 1 - src/lib.rs | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fa1491a..eef0dcd 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,6 @@ walkdir = "2" rayon = "1" seahash = "4" clap = { version = "4.4.2", features = ["derive"] } - [lib] name = "dupelib" path = "src/lib.rs" diff --git a/src/lib.rs b/src/lib.rs index 745e1e9..6de6b78 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,7 +43,7 @@ fn hash_file(path: &DirEntry) -> BoxResult { let mut hasher = SeaHasher::new(); let mut reader = BufReader::new(file); - let mut buffer = [0; 64000]; + let mut buffer = vec![0; 512000]; loop { let count = reader.read(&mut buffer)?; @@ -118,12 +118,37 @@ fn walk_dirs(input: Vec) -> Vec { } fn cull_by_filesize(input: Vec, minimum: u64) -> Vec { + let input: Vec<_> = input + .par_iter() + .cloned() + .filter_map(|mut candidate| { + let current_path = &candidate.path; + if let Ok(bytes_count) = byte_count_file(¤t_path) { + if bytes_count >= minimum { + candidate.size = Some(bytes_count); + return Some(candidate) + } + } + None + }) + .collect(); + + let mut hashes = HashSet::new(); + let mut dupe_hashes = HashSet::new(); + for candidate in &input { + if let Some(hash) = candidate.size { + if hashes.contains(&hash) { + dupe_hashes.insert(hash); + } else { + hashes.insert(hash); + } + } + } + let mut out = Vec::new(); - for mut candidate in input { - let current_path = &candidate.path; - if let Ok(bytes_count) = byte_count_file(¤t_path) { - if bytes_count >= minimum { - candidate.size = Some(bytes_count); + for candidate in input { + if let Some(hash) = candidate.size { + if dupe_hashes.contains(&hash) { out.push(candidate) } } From ca393976211f8d1d83d1b72a937a94c24c4610b7 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 18:02:08 +1000 Subject: [PATCH 08/17] fmt --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6de6b78..9df0ed1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ extern crate rayon; extern crate walkdir; use clap::Parser; use rayon::prelude::*; +use 
seahash::SeaHasher; use std::collections::HashSet; use std::error::Error; use std::fs::File; @@ -13,7 +14,6 @@ use std::path::PathBuf; use std::time::Instant; use walkdir::DirEntry; use walkdir::WalkDir; -use seahash::SeaHasher; #[derive(Parser, Debug, Default)] pub struct Opt { @@ -126,7 +126,7 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec= minimum { candidate.size = Some(bytes_count); - return Some(candidate) + return Some(candidate); } } None From 7fab7291abed2882b5b71bb8a3548d9ddc0fbc10 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 18:34:34 +1000 Subject: [PATCH 09/17] fixup --- src/lib.rs | 2 +- tests/integration_test.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9df0ed1..fbec72f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -199,7 +199,7 @@ fn cull_by_hash(input: Vec) -> Vec { .map(|mut candidate| { let current_path = &candidate.path; if let Ok(hash) = hash_file(current_path) { - candidate.start_hash = Some(hash); + candidate.full_hash = Some(hash); } candidate }) diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 28769ec..2712536 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -13,7 +13,7 @@ fn test_case_dir(case: &str) -> PathBuf { #[test] fn test_base_case() { let options = Default::default(); - assert_eq!(dupelib::detect_dupes(options, None).len(), 0); + assert_eq!(dupelib::detect_dupes(options).len(), 0); } #[test] @@ -23,7 +23,7 @@ fn test_one_file() { paths: vec![path], ..Default::default() }; - assert_eq!(dupelib::detect_dupes(options, None).len(), 0); + assert_eq!(dupelib::detect_dupes(options).len(), 0); } #[test] @@ -33,7 +33,7 @@ fn test_ident_files() { paths: vec![path], ..Default::default() }; - assert_eq!(dupelib::detect_dupes(options, None).len(), 2); + assert_eq!(dupelib::detect_dupes(options).len(), 2); } #[test] @@ -44,5 +44,5 @@ fn test_ident_files_minimum() { minimum: Some(2), ..Default::default() }; - 
assert_eq!(dupelib::detect_dupes(options, None).len(), 1); + assert_eq!(dupelib::detect_dupes(options).len(), 1); } From 18b41e267c29f3b394c42568f93bb6b52f3956ee Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 20:12:17 +1000 Subject: [PATCH 10/17] better format --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + Cross.toml | 1 - src/bin/cli.rs | 3 +-- src/lib.rs | 36 ++++++++++++++++++++++++++++++------ 5 files changed, 39 insertions(+), 9 deletions(-) delete mode 100644 Cross.toml diff --git a/Cargo.lock b/Cargo.lock index a410998..447fd3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -83,6 +83,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytesize" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" + [[package]] name = "cast" version = "0.3.0" @@ -260,6 +266,7 @@ dependencies = [ name = "dedupe" version = "3.2.1" dependencies = [ + "bytesize", "clap", "criterion", "rayon", diff --git a/Cargo.toml b/Cargo.toml index eef0dcd..d5f3474 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ walkdir = "2" rayon = "1" seahash = "4" clap = { version = "4.4.2", features = ["derive"] } +bytesize = "1.3.0" [lib] name = "dupelib" path = "src/lib.rs" diff --git a/Cross.toml b/Cross.toml deleted file mode 100644 index b3dc6fc..0000000 --- a/Cross.toml +++ /dev/null @@ -1 +0,0 @@ -[target.armv7-unknown-linux-gnueabihf] diff --git a/src/bin/cli.rs b/src/bin/cli.rs index 83721e2..4414abc 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -7,8 +7,7 @@ use dupelib::detect_dupes; use dupelib::Opt; fn run_dupe_detect(options: Opt) { - let dupes = detect_dupes(options); - println!("{:?}", dupes); + detect_dupes(options); } fn main() { diff --git a/src/lib.rs b/src/lib.rs index fbec72f..09e2dd1 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -1,6 +1,7 @@ extern crate clap; extern crate rayon; extern crate walkdir; +extern crate bytesize; use clap::Parser; use rayon::prelude::*; use seahash::SeaHasher; @@ -227,13 +228,36 @@ fn cull_by_hash(input: Vec) -> Vec { } out } - -fn format_results(input: Vec) -> Vec { - let mut out = Vec::new(); +use std::cmp::Ordering; + +use bytesize::ByteSize; +fn format_results(mut input: Vec) -> () { + input.sort_unstable_by(|a, b|{ + let size_cmp = b.size.partial_cmp(&a.size).unwrap(); + if size_cmp != Ordering::Equal { + return size_cmp + } + let hash_cmp = b.full_hash.partial_cmp(&a.full_hash).unwrap(); + if hash_cmp != Ordering::Equal { + return hash_cmp + } + format!("{}",b.path.path().display()).partial_cmp(&format!("{}",a.path.path().display())).unwrap() + }); + let mut last_size : u64 = 0; + let mut last_hash : u64 = 0; for item in input { - out.push(format!("{}: \n", item.path.path().display())) + let hash = item.full_hash.unwrap(); + if hash != last_hash{ + println!("-------"); + last_hash = hash; + } + let size = item.size.unwrap(); + if size != last_size{ + println!("Size: {} ", ByteSize(size)); + last_size = size; + } + println!("Path: {} ", item.path.path().display()); } - out } #[derive(Clone)] @@ -244,7 +268,7 @@ struct CandidateFile { full_hash: Option, } -pub fn detect_dupes(options: Opt) -> Vec { +pub fn detect_dupes(options: Opt) -> () { let now = Instant::now(); let paths = walk_dirs(options.paths); From 5434490827c4d597fee1584c69b2e26a8ba7d8f0 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 20:33:45 +1000 Subject: [PATCH 11/17] better format --- src/bin/cli.rs | 2 +- src/lib.rs | 138 +++++++++++++++++++++++++------------------------ 2 files changed, 71 insertions(+), 69 deletions(-) diff --git a/src/bin/cli.rs b/src/bin/cli.rs index 4414abc..7866708 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -7,7 +7,7 @@ use dupelib::detect_dupes; use dupelib::Opt; fn run_dupe_detect(options: Opt) { - detect_dupes(options); + 
detect_dupes(options); } fn main() { diff --git a/src/lib.rs b/src/lib.rs index 09e2dd1..bb30e89 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ +extern crate bytesize; extern crate clap; extern crate rayon; extern crate walkdir; -extern crate bytesize; use clap::Parser; use rayon::prelude::*; use seahash::SeaHasher; @@ -107,18 +107,13 @@ fn walk_dirs(input: Vec) -> Vec { .collect(); let mut paths = Vec::new(); for entry in vec { - let item = CandidateFile { - path: entry, - size: None, - start_hash: None, - full_hash: None, - }; + let item = CandidateFile { path: entry }; paths.push(item); } paths } -fn cull_by_filesize(input: Vec, minimum: u64) -> Vec { +fn cull_by_filesize(input: Vec, minimum: u64) -> Vec { let input: Vec<_> = input .par_iter() .cloned() @@ -126,8 +121,11 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec= minimum { - candidate.size = Some(bytes_count); - return Some(candidate); + let res = CandidateFileWithSize { + path: candidate.path, + size: bytes_count, + }; + return Some(res); } } None @@ -137,93 +135,87 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec) -> Vec { +fn cull_by_start(input: Vec) -> Vec { let input: Vec<_> = input .par_iter() .cloned() - .map(|mut candidate| { + .filter_map(|mut candidate| { let current_path = &candidate.path; if let Ok(hash) = hash_start_file(current_path) { - candidate.start_hash = Some(hash); + let res = CandidateFileWithSizeAndHash { + path: candidate.path, + size: candidate.size, + hash: hash, + }; + return Some(res); } - candidate + None }) .collect(); let mut hashes = HashSet::new(); let mut dupe_hashes = HashSet::new(); for candidate in &input { - if let Some(hash) = candidate.start_hash { - if hashes.contains(&hash) { - dupe_hashes.insert(hash); - } else { - hashes.insert(hash); - } + if hashes.contains(&candidate.hash) { + dupe_hashes.insert(candidate.hash); + } else { + hashes.insert(candidate.hash); } } let mut out = Vec::new(); for candidate in input { - if let Some(hash) = 
candidate.start_hash { - if dupe_hashes.contains(&hash) { - out.push(candidate) - } + if dupe_hashes.contains(&candidate.hash) { + out.push(candidate) } } out } -fn cull_by_hash(input: Vec) -> Vec { +fn cull_by_hash(input: Vec) -> Vec { let input: Vec<_> = input .par_iter() .cloned() - .map(|mut candidate| { + .filter_map(|mut candidate| { let current_path = &candidate.path; if let Ok(hash) = hash_file(current_path) { - candidate.full_hash = Some(hash); + candidate.hash = hash; + return Some(candidate); } - candidate + None }) .collect(); let mut hashes = HashSet::new(); let mut dupe_hashes = HashSet::new(); for candidate in &input { - if let Some(hash) = candidate.full_hash { - if hashes.contains(&hash) { - dupe_hashes.insert(hash); - } else { - hashes.insert(hash); - } + if hashes.contains(&candidate.hash) { + dupe_hashes.insert(candidate.hash); + } else { + hashes.insert(candidate.hash); } } let mut out = Vec::new(); for candidate in input { - if let Some(hash) = candidate.full_hash { - if dupe_hashes.contains(&hash) { - out.push(candidate) - } + if dupe_hashes.contains(&candidate.hash) { + out.push(candidate) } } out @@ -231,28 +223,28 @@ fn cull_by_hash(input: Vec) -> Vec { use std::cmp::Ordering; use bytesize::ByteSize; -fn format_results(mut input: Vec) -> () { - input.sort_unstable_by(|a, b|{ - let size_cmp = b.size.partial_cmp(&a.size).unwrap(); - if size_cmp != Ordering::Equal { - return size_cmp - } - let hash_cmp = b.full_hash.partial_cmp(&a.full_hash).unwrap(); - if hash_cmp != Ordering::Equal { - return hash_cmp - } - format!("{}",b.path.path().display()).partial_cmp(&format!("{}",a.path.path().display())).unwrap() +fn format_results(mut input: Vec) -> () { + input.sort_unstable_by(|a, b| { + let size_cmp = b.size.cmp(&a.size); + if size_cmp != Ordering::Equal { + return size_cmp; + } + let hash_cmp = b.hash.cmp(&a.hash); + if hash_cmp != Ordering::Equal { + return hash_cmp; + } + format!("{}", b.path.path().display()).cmp(&format!("{}", 
a.path.path().display())) }); - let mut last_size : u64 = 0; - let mut last_hash : u64 = 0; + let mut last_size: u64 = 0; + let mut last_hash: u64 = 0; for item in input { - let hash = item.full_hash.unwrap(); - if hash != last_hash{ + let hash = item.hash; + if hash != last_hash { println!("-------"); last_hash = hash; } - let size = item.size.unwrap(); - if size != last_size{ + let size = item.size; + if size != last_size { println!("Size: {} ", ByteSize(size)); last_size = size; } @@ -263,9 +255,19 @@ fn format_results(mut input: Vec) -> () { #[derive(Clone)] struct CandidateFile { path: DirEntry, - size: Option, - start_hash: Option, - full_hash: Option, +} + +#[derive(Clone)] +struct CandidateFileWithSize { + path: DirEntry, + size: u64, +} + +#[derive(Clone)] +struct CandidateFileWithSizeAndHash { + path: DirEntry, + size: u64, + hash: u64, } pub fn detect_dupes(options: Opt) -> () { From f16e9c80ddb4bf803ea9cc06bb93486c7e000128 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 20:34:45 +1000 Subject: [PATCH 12/17] fixup --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bb30e89..076ea1e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,7 +117,7 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec = input .par_iter() .cloned() - .filter_map(|mut candidate| { + .filter_map(|candidate| { let current_path = &candidate.path; if let Ok(bytes_count) = byte_count_file(&current_path) { if bytes_count >= minimum { @@ -155,7 +155,7 @@ fn cull_by_start(input: Vec) -> Vec = input .par_iter() .cloned() - .filter_map(|mut candidate| { + .filter_map(|candidate| { let current_path = &candidate.path; if let Ok(hash) = hash_start_file(current_path) { let res = CandidateFileWithSizeAndHash { From a93f381d08676c18da57daa9d8452678708d9136 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 20:40:18 +1000 Subject: [PATCH 13/17] fix tests --- src/lib.rs | 8 ++++---- tests/integration_test.rs | 4 ++-- 2
files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 076ea1e..69130bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -264,13 +264,13 @@ struct CandidateFileWithSize { } #[derive(Clone)] -struct CandidateFileWithSizeAndHash { +pub struct CandidateFileWithSizeAndHash { path: DirEntry, size: u64, hash: u64, } -pub fn detect_dupes(options: Opt) -> () { +pub fn detect_dupes(options: Opt) -> Vec { let now = Instant::now(); let paths = walk_dirs(options.paths); @@ -298,10 +298,10 @@ pub fn detect_dupes(options: Opt) -> () { println!("{} dupes after full file hashing", paths.len()); } - let output_strings = format_results(paths); + format_results(paths.clone()); if options.timing { print_timing_info(now); } - output_strings + paths } diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 2712536..ec26668 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -33,7 +33,7 @@ fn test_ident_files() { paths: vec![path], ..Default::default() }; - assert_eq!(dupelib::detect_dupes(options).len(), 2); + assert_eq!(dupelib::detect_dupes(options).len(), 4); } #[test] @@ -44,5 +44,5 @@ fn test_ident_files_minimum() { minimum: Some(2), ..Default::default() }; - assert_eq!(dupelib::detect_dupes(options).len(), 1); + assert_eq!(dupelib::detect_dupes(options).len(), 2); } From b94d2cb7d034cfc18bb3a9fa23b99a9ab21282c2 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 22:10:32 +1000 Subject: [PATCH 14/17] no clones --- src/lib.rs | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 69130bd..9984045 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,14 +115,13 @@ fn walk_dirs(input: Vec) -> Vec { fn cull_by_filesize(input: Vec, minimum: u64) -> Vec { let input: Vec<_> = input - .par_iter() - .cloned() + .into_par_iter() .filter_map(|candidate| { - let current_path = &candidate.path; + let current_path = candidate.path; if let 
Ok(bytes_count) = byte_count_file(&current_path) { if bytes_count >= minimum { let res = CandidateFileWithSize { - path: candidate.path, + path: current_path, size: bytes_count, }; return Some(res); @@ -131,7 +130,6 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec, minimum: u64) -> Vec) -> Vec { let input: Vec<_> = input - .par_iter() - .cloned() + .into_par_iter() .filter_map(|candidate| { - let current_path = &candidate.path; - if let Ok(hash) = hash_start_file(current_path) { + let current_path = candidate.path; + if let Ok(hash) = hash_start_file(&current_path) { let res = CandidateFileWithSizeAndHash { - path: candidate.path, + path: current_path, size: candidate.size, hash: hash, }; @@ -190,13 +187,16 @@ fn cull_by_start(input: Vec) -> Vec) -> Vec { let input: Vec<_> = input - .par_iter() - .cloned() - .filter_map(|mut candidate| { - let current_path = &candidate.path; - if let Ok(hash) = hash_file(current_path) { - candidate.hash = hash; - return Some(candidate); + .into_par_iter() + .filter_map(|candidate| { + let current_path = candidate.path; + if let Ok(hash) = hash_file(&current_path) { + let res = CandidateFileWithSizeAndHash { + path: current_path, + size: candidate.size, + hash: hash, + }; + return Some(res); } None }) From 19bcae3cd3b3c6e5e64cd2fc58dcfe3ab2753f99 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 22:48:04 +1000 Subject: [PATCH 15/17] more cleanup --- src/lib.rs | 98 +++++++++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9984045..07cbc8c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,9 +2,11 @@ extern crate bytesize; extern crate clap; extern crate rayon; extern crate walkdir; +use bytesize::ByteSize; use clap::Parser; use rayon::prelude::*; use seahash::SeaHasher; +use std::cmp::Ordering; use std::collections::HashSet; use std::error::Error; use std::fs::File; @@ -113,6 +115,38 @@ fn walk_dirs(input: Vec) -> Vec { paths } +struct DupeRecords
+where + T: std::cmp::Eq + std::hash::Hash, +{ + hashes: HashSet, + dupe_hashes: HashSet, +} + +impl DupeRecords +where + T: std::cmp::Eq + std::hash::Hash, +{ + fn new() -> DupeRecords { + DupeRecords { + hashes: HashSet::::new(), + dupe_hashes: HashSet::::new(), + } + } + + fn load(&mut self, datum: T) { + if self.hashes.contains(&datum) { + self.dupe_hashes.insert(datum); + } else { + self.hashes.insert(datum); + } + } + + fn contains(&self, datum: T) -> bool { + return self.dupe_hashes.contains(&datum); + } +} + fn cull_by_filesize(input: Vec, minimum: u64) -> Vec { let input: Vec<_> = input .into_par_iter() @@ -130,23 +164,15 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec) -> Vec { @@ -166,23 +192,15 @@ fn cull_by_start(input: Vec) -> Vec) -> Vec { @@ -202,27 +220,17 @@ fn cull_by_hash(input: Vec) -> Vec) -> () { input.sort_unstable_by(|a, b| { let size_cmp = b.size.cmp(&a.size); From 06a75f46b42aa9174fd746aa59b5fc2b35889104 Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 23:01:12 +1000 Subject: [PATCH 16/17] more cleanup --- src/lib.rs | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 07cbc8c..03dac3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,7 @@ use std::time::Instant; use walkdir::DirEntry; use walkdir::WalkDir; -#[derive(Parser, Debug, Default)] +#[derive(Parser, Default)] pub struct Opt { pub paths: Vec, @@ -96,23 +96,18 @@ fn print_timing_info(now: Instant) { } fn walk_dirs(input: Vec) -> Vec { - let vec: Vec = input + input .par_iter() .map(|path| { WalkDir::new(path) .into_iter() .filter_entry(|e| !is_hidden(e)) .filter_map(is_valid_file) - .collect::>() + .map(|entry| CandidateFile { path: entry }) + .collect::>() }) .flatten() - .collect(); - let mut paths = Vec::new(); - for entry in vec { - let item = CandidateFile { path: entry }; - paths.push(item); - } - paths + .collect() } struct DupeRecords @@ -231,7 +226,9 @@ fn 
cull_by_hash(input: Vec) -> Vec) -> () { +fn format_results( + mut input: Vec, +) -> Vec { input.sort_unstable_by(|a, b| { let size_cmp = b.size.cmp(&a.size); if size_cmp != Ordering::Equal { @@ -245,7 +242,7 @@ fn format_results(mut input: Vec) -> () { }); let mut last_size: u64 = 0; let mut last_hash: u64 = 0; - for item in input { + for item in &input { let hash = item.hash; if hash != last_hash { println!("-------"); @@ -258,20 +255,18 @@ fn format_results(mut input: Vec) -> () { } println!("Path: {} ", item.path.path().display()); } + return input; } -#[derive(Clone)] struct CandidateFile { path: DirEntry, } -#[derive(Clone)] struct CandidateFileWithSize { path: DirEntry, size: u64, } -#[derive(Clone)] pub struct CandidateFileWithSizeAndHash { path: DirEntry, size: u64, @@ -305,11 +300,8 @@ pub fn detect_dupes(options: Opt) -> Vec { if options.debug { println!("{} dupes after full file hashing", paths.len()); } - - format_results(paths.clone()); - if options.timing { print_timing_info(now); } - paths + format_results(paths) } From 970c20d07524e68ff13a6bcda71aaa20e351bc3a Mon Sep 17 00:00:00 2001 From: Justin Date: Wed, 6 Sep 2023 23:05:41 +1000 Subject: [PATCH 17/17] format --- src/lib.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 03dac3c..a370ce2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,9 +45,7 @@ fn hash_file(path: &DirEntry) -> BoxResult { let file = File::open(path.path())?; let mut hasher = SeaHasher::new(); let mut reader = BufReader::new(file); - let mut buffer = vec![0; 512000]; - loop { let count = reader.read(&mut buffer)?; if count == 0 { @@ -55,7 +53,6 @@ fn hash_file(path: &DirEntry) -> BoxResult { } hasher.write(&buffer[..count]); } - Ok(hasher.finish()) } @@ -159,7 +156,6 @@ fn cull_by_filesize(input: Vec, minimum: u64) -> Vec) -> Vec) -> Vec Vec { let now = Instant::now(); let paths = walk_dirs(options.paths); - if options.debug { println!("{} files found ", paths.len()); } - let 
minimum = options.minimum.unwrap_or(1); - let paths = cull_by_filesize(paths, minimum); - if options.debug { println!("{} potential dupes after filesize cull", paths.len()); } - let paths = cull_by_start(paths); - if options.debug { println!("{} potential dupes after start cull", paths.len()); } - let paths = cull_by_hash(paths); - if options.debug { println!("{} dupes after full file hashing", paths.len()); }