From 8f6876d65dd06d3545cfe4bbe5a6faf790380fe4 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 29 Dec 2023 22:39:59 -0500 Subject: [PATCH 1/5] `cat`: add rowskey --group options; increased perf of rowskey --group is no longer just a boolean option; it now accepts a grpkind parameter. Valid values are: * fullpath * parentdirfname * parentdirfstem * fname * fstem * none with a default of "none". This implements #1506. Also made rowskey faster by using push_field() instead of write_field(). This should greatly reduce I/O operations by writing by row rather than by field. --- src/cmd/cat.rs | 134 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 109 insertions(+), 25 deletions(-) diff --git a/src/cmd/cat.rs b/src/cmd/cat.rs index af4e30c9c..27de60531 100644 --- a/src/cmd/cat.rs +++ b/src/cmd/cat.rs @@ -50,9 +50,13 @@ cat options: This is faster, but may result in invalid CSV data. ROWSKEY OPTIONS: - -g, --group When concatenating with rowskey, use the file stem of each - input file as a grouping value. A new column will be added - to the beginning of each row, using --group-name. + -g, --group <grpkind> When concatenating with rowskey, you can specify a grouping value + which will be used as the first column in the output. This is useful + when you want to know which file a row came from. Valid values are + 'fullpath', 'parentdirfname', 'parentdirfstem', 'fname', 'fstem' and 'none'. + A new column will be added to the beginning of each row using --group-name. + If 'none' is specified, no grouping column will be added. + [default: none] -N, --group-name When concatenating with rowskey, this flag provides the name for the new grouping column. [default: file] @@ -66,10 +70,14 @@ Common options: Must be a single character. 
(default: ,) "#; -use std::path::PathBuf; +use std::{ + path::{Path, PathBuf}, + str::FromStr, +}; use indexmap::{IndexMap, IndexSet}; use serde::Deserialize; +use strum_macros::EnumString; use tempfile; use crate::{ @@ -82,7 +90,7 @@ struct Args { cmd_rows: bool, cmd_rowskey: bool, cmd_columns: bool, - flag_group: bool, + flag_group: String, flag_group_name: String, arg_input: Vec, flag_pad: bool, @@ -92,6 +100,45 @@ struct Args { flag_delimiter: Option, } +#[derive(Debug, EnumString, PartialEq)] +#[strum(ascii_case_insensitive)] +enum GroupKind { + FullPath, + ParentDirFName, + ParentDirFStem, + FName, + FStem, + None, +} + +fn get_parentdir_and_file>(path: P, stem_only: bool) -> Option { + let path = path.as_ref(); + + let file_info = if stem_only { + path.file_stem() + } else { + path.file_name() + }; + + let file_name = file_info.and_then(|f| f.to_str()); + + let parent_dir = path + .parent() + .and_then(|p| p.to_str()) + .filter(|s| !s.is_empty()); + + match (parent_dir, file_name) { + (Some(parent_dir), Some(file_name)) => Some( + Path::new(parent_dir) + .join(file_name) + .to_string_lossy() + .into_owned(), + ), + (None, Some(file_name)) => Some(file_name.to_string()), + _ => None, + } +} + pub fn run(argv: &[&str]) -> CliResult<()> { let mut args: Args = util::get_args(USAGE, argv)?; @@ -168,9 +215,17 @@ impl Args { ); } + let Ok(group_kind) = GroupKind::from_str(&self.flag_group) else { + return fail_incorrectusage_clierror!( + "Invalid grouping value `{}`. 
Valid values are 'fullpath', 'parentdirfname', \ + 'parentdirfstem', 'fname', 'fstem' and 'none'.", + self.flag_group + ); + }; + let mut columns_global: AhashIndexSet> = AhashIndexSet::default(); - if self.flag_group { + if group_kind != GroupKind::None { columns_global.insert(self.flag_group_name.as_bytes().to_vec().into_boxed_slice()); } @@ -201,14 +256,15 @@ impl Args { // as we know that all columns are already in columns_global and we don't need to // validate that the number of columns are the same every time we write a row let mut wtr = Config::new(&self.flag_output).flexible(true).writer()?; + let mut new_row = csv::ByteRecord::with_capacity(500, num_columns_global); + for c in &columns_global { - wtr.write_field(c)?; + new_row.push_field(c); } - let empty_byte_record = csv::ByteRecord::new(); - wtr.write_byte_record(&empty_byte_record)?; + wtr.write_byte_record(&new_row)?; // amortize allocations - let mut grouping_value; + let mut grouping_value = String::new(); let mut conf_path; let mut rdr; let mut header: &csv::ByteRecord; @@ -241,35 +297,63 @@ impl Args { columns_of_this_file.insert(fi, n); } - // use the file stem as the grouping value - // safety: we know that this is a file path and if the file path - // is not valid utf8, we convert it to lossy utf8 - grouping_value = conf_path - .unwrap() - .file_stem() - .unwrap() - .to_string_lossy() - .to_string(); + // set grouping_value + // safety: we know that this is a valid file path and if the file path + // is not utf8, we convert it to lossy utf8 + match group_kind { + GroupKind::FullPath => { + grouping_value.clear(); + grouping_value + .push_str(&conf_path.unwrap().canonicalize().unwrap().to_string_lossy()); + }, + GroupKind::ParentDirFName => { + grouping_value.clear(); + // grouping_value.push_str(&get_parent_and_filename(&conf_path.unwrap()). 
+ // unwrap()); + grouping_value + .push_str(&get_parentdir_and_file(conf_path.unwrap(), false).unwrap()); + }, + GroupKind::ParentDirFStem => { + grouping_value.clear(); + // grouping_value.push_str(&get_parent_and_filename(&conf_path.unwrap()). + // unwrap()); + grouping_value + .push_str(&get_parentdir_and_file(conf_path.unwrap(), true).unwrap()); + }, + GroupKind::FName => { + grouping_value.clear(); + grouping_value + .push_str(&conf_path.unwrap().file_name().unwrap().to_string_lossy()); + }, + GroupKind::FStem => { + grouping_value.clear(); + grouping_value + .push_str(&conf_path.unwrap().file_stem().unwrap().to_string_lossy()); + }, + GroupKind::None => {}, + } while rdr.read_byte_record(&mut row)? { + new_row.clear(); for (col_idx, c) in columns_global.iter().enumerate() { if let Some(idx) = columns_of_this_file.get(c) { if let Some(d) = row.get(*idx) { - wtr.write_field(d)?; + new_row.push_field(d); } else { - wtr.write_field(b"")?; + new_row.push_field(b""); } - } else if self.flag_group && col_idx == 0 { + } else if group_kind != GroupKind::None && col_idx == 0 { // we are in the first column, and --group is set // so we write the grouping value - wtr.write_field(&grouping_value)?; + new_row.push_field(grouping_value.as_bytes()); } else { - wtr.write_field(b"")?; + new_row.push_field(b""); } } - wtr.write_byte_record(&empty_byte_record)?; + wtr.write_byte_record(&new_row)?; } } + Ok(wtr.flush()?) 
} From 9adb4270623bdb32750e9ea5131f5cc54942b199 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 29 Dec 2023 22:40:27 -0500 Subject: [PATCH 2/5] add create_subdir() helper fn to create subdirs in tests --- tests/workdir.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/workdir.rs b/tests/workdir.rs index d601f676d..17d4d1381 100644 --- a/tests/workdir.rs +++ b/tests/workdir.rs @@ -252,6 +252,13 @@ impl Workdir { } Ok(()) } + + // create a subdirectory + pub fn create_subdir(&self, name: &str) -> io::Result<()> { + let mut path = self.dir.clone(); + path.push(name); + create_dir_all(path) + } } impl fmt::Debug for Workdir { From 314de1134e38c1d4352d81669eea22e90bd0df15 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 29 Dec 2023 22:41:11 -0500 Subject: [PATCH 3/5] `cat`: add rowskey --group options --- tests/test_cat.rs | 117 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 3 deletions(-) diff --git a/tests/test_cat.rs b/tests/test_cat.rs index eb605ea05..dc4094ed3 100644 --- a/tests/test_cat.rs +++ b/tests/test_cat.rs @@ -256,7 +256,7 @@ fn cat_rowskey_grouping() { let mut cmd = wrk.command("cat"); cmd.arg("rowskey") - .arg("--group") + .args(["--group", "fstem"]) .arg("in1.csv") .arg("in2.csv") .arg("in3.csv"); @@ -275,6 +275,117 @@ fn cat_rowskey_grouping() { assert_eq!(got, expected); } +#[test] +fn cat_rowskey_grouping_parentdirfname() { + let wrk = Workdir::new("cat_rowskey_grouping_parentdirfname"); + wrk.create( + "in1.csv", + vec![ + svec!["a", "b", "c"], + svec!["1", "2", "3"], + svec!["2", "3", "4"], + ], + ); + + wrk.create_with_delim( + "in2.tsv", + vec![ + svec!["c", "a", "b"], + svec!["3", "1", "2"], + svec!["4", "2", "3"], + ], + b'\t', + ); + + // create a subdirectory and put in3.csv in it + let _ = wrk.create_subdir("testdir"); + + wrk.create( + "testdir/in3.csv", + vec![ + svec!["a", "b", 
"d", "c"], + svec!["1", "2", "4", "3"], + svec!["2", "3", "5", "4"], + svec!["z", "y", "w", "x"], + ], + ); + + let mut cmd = wrk.command("cat"); + cmd.arg("rowskey") + .args(["--group", "parentdirfname"]) + .arg("in1.csv") + .arg("in2.tsv") + .arg("testdir/in3.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["file", "a", "b", "c", "d"], + svec!["in1.csv", "1", "2", "3", ""], + svec!["in1.csv", "2", "3", "4", ""], + svec!["in2.tsv", "1", "2", "3", ""], + svec!["in2.tsv", "2", "3", "4", ""], + svec!["testdir/in3.csv", "1", "2", "3", "4"], + svec!["testdir/in3.csv", "2", "3", "4", "5"], + svec!["testdir/in3.csv", "z", "y", "x", "w"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn cat_rowskey_grouping_parentdirfstem() { + let wrk = Workdir::new("cat_rowskey_grouping_parentdirfstem"); + wrk.create( + "in1.csv", + vec![ + svec!["a", "b", "c"], + svec!["1", "2", "3"], + svec!["2", "3", "4"], + ], + ); + + wrk.create( + "in2.csv", + vec![ + svec!["c", "a", "b"], + svec!["3", "1", "2"], + svec!["4", "2", "3"], + ], + ); + + // create a subdirectory and put in3.csv in it + let _ = wrk.create_subdir("testdir"); + + wrk.create( + "testdir/in3.csv", + vec![ + svec!["a", "b", "d", "c"], + svec!["1", "2", "4", "3"], + svec!["2", "3", "5", "4"], + svec!["z", "y", "w", "x"], + ], + ); + + let mut cmd = wrk.command("cat"); + cmd.arg("rowskey") + .args(["--group", "parentdirfstem"]) + .arg("in1.csv") + .arg("in2.csv") + .arg("testdir/in3.csv"); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["file", "a", "b", "c", "d"], + svec!["in1", "1", "2", "3", ""], + svec!["in1", "2", "3", "4", ""], + svec!["in2", "1", "2", "3", ""], + svec!["in2", "2", "3", "4", ""], + svec!["testdir/in3", "1", "2", "3", "4"], + svec!["testdir/in3", "2", "3", "4", "5"], + svec!["testdir/in3", "z", "y", "x", "w"], + ]; + assert_eq!(got, expected); +} + #[test] fn cat_rowskey_grouping_infile() { let wrk = 
Workdir::new("cat_rowskey_grouping_infile"); @@ -310,7 +421,7 @@ fn cat_rowskey_grouping_infile() { let mut cmd = wrk.command("cat"); cmd.arg("rowskey") - .arg("--group") + .args(["-g", "FStem"]) .arg("testdata.infile-list"); let got: Vec> = wrk.read_stdout(&mut cmd); @@ -360,7 +471,7 @@ fn cat_rowskey_grouping_customname() { let mut cmd = wrk.command("cat"); cmd.arg("rowskey") - .arg("--group") + .args(["--group", "fstem"]) .args(&["--group-name", "file group label"]) .arg("in1.csv") .arg("in2.csv") From 02e7f12c9c1778f33bf558b5a9a10542b7f4eb42 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 29 Dec 2023 23:21:20 -0500 Subject: [PATCH 4/5] `tests`: on Windows, the directory separator is backslash --- tests/test_cat.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_cat.rs b/tests/test_cat.rs index dc4094ed3..f70bcb48f 100644 --- a/tests/test_cat.rs +++ b/tests/test_cat.rs @@ -318,6 +318,20 @@ fn cat_rowskey_grouping_parentdirfname() { .arg("testdir/in3.csv"); let got: Vec> = wrk.read_stdout(&mut cmd); + // on Windows, the directory separator is a backslash, which is an escape character in Rust + // string literals, so it is written as a double backslash in the expected values below. 
+ #[cfg(windows)] + let expected = vec![ + svec!["file", "a", "b", "c", "d"], + svec!["in1.csv", "1", "2", "3", ""], + svec!["in1.csv", "2", "3", "4", ""], + svec!["in2.tsv", "1", "2", "3", ""], + svec!["in2.tsv", "2", "3", "4", ""], + svec!["testdir\\in3.csv", "1", "2", "3", "4"], + svec!["testdir\\in3.csv", "2", "3", "4", "5"], + svec!["testdir\\in3.csv", "z", "y", "x", "w"], + ]; + #[cfg(not(windows))] let expected = vec![ svec!["file", "a", "b", "c", "d"], svec!["in1.csv", "1", "2", "3", ""], @@ -373,6 +387,20 @@ fn cat_rowskey_grouping_parentdirfstem() { .arg("testdir/in3.csv"); let got: Vec> = wrk.read_stdout(&mut cmd); + // on Windows, the directory separator is a backslash, which is an escape character in Rust + // string literals, so it is written as a double backslash in the expected values below. + #[cfg(windows)] + let expected = vec![ + svec!["file", "a", "b", "c", "d"], + svec!["in1.csv", "1", "2", "3", ""], + svec!["in1.csv", "2", "3", "4", ""], + svec!["in2.csv", "1", "2", "3", ""], + svec!["in2.csv", "2", "3", "4", ""], + svec!["testdir\\in3.csv", "1", "2", "3", "4"], + svec!["testdir\\in3.csv", "2", "3", "4", "5"], + svec!["testdir\\in3.csv", "z", "y", "x", "w"], + ]; + #[cfg(not(windows))] let expected = vec![ svec!["file", "a", "b", "c", "d"], svec!["in1", "1", "2", "3", ""], From 8e2175de6730d1cf6ad891cb5b22bc5d5934c2d4 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 29 Dec 2023 23:31:06 -0500 Subject: [PATCH 5/5] ooops... 
c&p error --- tests/test_cat.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_cat.rs b/tests/test_cat.rs index f70bcb48f..a44255b85 100644 --- a/tests/test_cat.rs +++ b/tests/test_cat.rs @@ -392,13 +392,13 @@ fn cat_rowskey_grouping_parentdirfstem() { #[cfg(windows)] let expected = vec![ svec!["file", "a", "b", "c", "d"], - svec!["in1.csv", "1", "2", "3", ""], - svec!["in1.csv", "2", "3", "4", ""], - svec!["in2.csv", "1", "2", "3", ""], - svec!["in2.csv", "2", "3", "4", ""], - svec!["testdir\\in3.csv", "1", "2", "3", "4"], - svec!["testdir\\in3.csv", "2", "3", "4", "5"], - svec!["testdir\\in3.csv", "z", "y", "x", "w"], + svec!["in1", "1", "2", "3", ""], + svec!["in1", "2", "3", "4", ""], + svec!["in2", "1", "2", "3", ""], + svec!["in2", "2", "3", "4", ""], + svec!["testdir\\in3", "1", "2", "3", "4"], + svec!["testdir\\in3", "2", "3", "4", "5"], + svec!["testdir\\in3", "z", "y", "x", "w"], ]; #[cfg(not(windows))] let expected = vec![