From 8f6876d65dd06d3545cfe4bbe5a6faf790380fe4 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 29 Dec 2023 22:39:59 -0500
Subject: [PATCH 1/5] `cat`: add rowskey --group options; increased perf of
 rowskey

--group is no longer a boolean flag;
it now takes a <grpkind> argument. Valid values are:
* fullpath
* parentdirfname
* parentdirfstem
* fname
* fstem
* none
with a default of "none"

this implements #1506

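Also made rowskey faster by building each output row with push_field() and writing it once with write_byte_record(), instead of calling write_field() for every field. This should greatly reduce I/O operations, as we now write by row rather than by field.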
---
 src/cmd/cat.rs | 134 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 109 insertions(+), 25 deletions(-)
diff --git a/src/cmd/cat.rs b/src/cmd/cat.rs
index af4e30c9c..27de60531 100644
--- a/src/cmd/cat.rs
+++ b/src/cmd/cat.rs
@@ -50,9 +50,13 @@ cat options:
                              This is faster, but may result in invalid CSV data.
 
                              ROWSKEY OPTIONS:
-    -g, --group              When concatenating with rowskey, use the file stem of each
-                             input file as a grouping value. A new column will be added
-                             to the beginning of each row, using --group-name.
+    -g, --group <grpkind>    When concatenating with rowskey, you can specify a grouping value
+                             which will be used as the first column in the output. This is useful
+                             when you want to know which file a row came from. Valid values are
+                             'fullpath', 'parentdirfname', 'parentdirfstem', 'fname', 'fstem' and 'none'.
+                             A new column will be added to the beginning of each row using --group-name.
+                             If 'none' is specified, no grouping column will be added.
+                             [default: none]
     -N, --group-name <arg>   When concatenating with rowskey, this flag provides the name
                              for the new grouping column. [default: file]
                              
@@ -66,10 +70,14 @@ Common options:
                            Must be a single character. (default: ,)
 "#;
 
-use std::path::PathBuf;
+use std::{
+    path::{Path, PathBuf},
+    str::FromStr,
+};
 
 use indexmap::{IndexMap, IndexSet};
 use serde::Deserialize;
+use strum_macros::EnumString;
 use tempfile;
 
 use crate::{
@@ -82,7 +90,7 @@ struct Args {
     cmd_rows:        bool,
     cmd_rowskey:     bool,
     cmd_columns:     bool,
-    flag_group:      bool,
+    flag_group:      String,
     flag_group_name: String,
     arg_input:       Vec<PathBuf>,
     flag_pad:        bool,
@@ -92,6 +100,45 @@ struct Args {
     flag_delimiter:  Option<Delimiter>,
 }
 
+#[derive(Debug, EnumString, PartialEq)]
+#[strum(ascii_case_insensitive)]
+enum GroupKind {
+    FullPath,
+    ParentDirFName,
+    ParentDirFStem,
+    FName,
+    FStem,
+    None,
+}
+
+fn get_parentdir_and_file<P: AsRef<Path>>(path: P, stem_only: bool) -> Option<String> {
+    let path = path.as_ref();
+
+    let file_info = if stem_only {
+        path.file_stem()
+    } else {
+        path.file_name()
+    };
+
+    let file_name = file_info.and_then(|f| f.to_str());
+
+    let parent_dir = path
+        .parent()
+        .and_then(|p| p.to_str())
+        .filter(|s| !s.is_empty());
+
+    match (parent_dir, file_name) {
+        (Some(parent_dir), Some(file_name)) => Some(
+            Path::new(parent_dir)
+                .join(file_name)
+                .to_string_lossy()
+                .into_owned(),
+        ),
+        (None, Some(file_name)) => Some(file_name.to_string()),
+        _ => None,
+    }
+}
+
 pub fn run(argv: &[&str]) -> CliResult<()> {
     let mut args: Args = util::get_args(USAGE, argv)?;
 
@@ -168,9 +215,17 @@ impl Args {
             );
         }
 
+        let Ok(group_kind) = GroupKind::from_str(&self.flag_group) else {
+            return fail_incorrectusage_clierror!(
+                "Invalid grouping value `{}`. Valid values are 'fullpath', 'parentdirfname', \
+                 'parentdirfstem', 'fname', 'fstem' and 'none'.",
+                self.flag_group
+            );
+        };
+
         let mut columns_global: AhashIndexSet<Box<[u8]>> = AhashIndexSet::default();
 
-        if self.flag_group {
+        if group_kind != GroupKind::None {
             columns_global.insert(self.flag_group_name.as_bytes().to_vec().into_boxed_slice());
         }
 
@@ -201,14 +256,15 @@ impl Args {
         // as we know that all columns are already in columns_global and we don't need to
         // validate that the number of columns are the same every time we write a row
         let mut wtr = Config::new(&self.flag_output).flexible(true).writer()?;
+        let mut new_row = csv::ByteRecord::with_capacity(500, num_columns_global);
+
         for c in &columns_global {
-            wtr.write_field(c)?;
+            new_row.push_field(c);
         }
-        let empty_byte_record = csv::ByteRecord::new();
-        wtr.write_byte_record(&empty_byte_record)?;
+        wtr.write_byte_record(&new_row)?;
 
         // amortize allocations
-        let mut grouping_value;
+        let mut grouping_value = String::new();
         let mut conf_path;
         let mut rdr;
         let mut header: &csv::ByteRecord;
@@ -241,35 +297,63 @@ impl Args {
                 columns_of_this_file.insert(fi, n);
             }
 
-            // use the file stem as the grouping value
-            // safety: we know that this is a file path and if the file path
-            // is not valid utf8, we convert it to lossy utf8
-            grouping_value = conf_path
-                .unwrap()
-                .file_stem()
-                .unwrap()
-                .to_string_lossy()
-                .to_string();
+            // set grouping_value
+            // safety: we know that this is a valid file path and if the file path
+            // is not utf8, we convert it to lossy utf8
+            match group_kind {
+                GroupKind::FullPath => {
+                    grouping_value.clear();
+                    grouping_value
+                        .push_str(&conf_path.unwrap().canonicalize().unwrap().to_string_lossy());
+                },
+                GroupKind::ParentDirFName => {
+                    grouping_value.clear();
+                    // parent directory joined with the file name (extension kept),
+                    // e.g. `testdir/in3.csv`
+                    grouping_value
+                        .push_str(&get_parentdir_and_file(conf_path.unwrap(), false).unwrap());
+                },
+                GroupKind::ParentDirFStem => {
+                    grouping_value.clear();
+                    // parent directory joined with the file stem (extension dropped),
+                    // e.g. `testdir/in3`
+                    grouping_value
+                        .push_str(&get_parentdir_and_file(conf_path.unwrap(), true).unwrap());
+                },
+                GroupKind::FName => {
+                    grouping_value.clear();
+                    grouping_value
+                        .push_str(&conf_path.unwrap().file_name().unwrap().to_string_lossy());
+                },
+                GroupKind::FStem => {
+                    grouping_value.clear();
+                    grouping_value
+                        .push_str(&conf_path.unwrap().file_stem().unwrap().to_string_lossy());
+                },
+                GroupKind::None => {},
+            }
 
             while rdr.read_byte_record(&mut row)? {
+                new_row.clear();
                 for (col_idx, c) in columns_global.iter().enumerate() {
                     if let Some(idx) = columns_of_this_file.get(c) {
                         if let Some(d) = row.get(*idx) {
-                            wtr.write_field(d)?;
+                            new_row.push_field(d);
                         } else {
-                            wtr.write_field(b"")?;
+                            new_row.push_field(b"");
                         }
-                    } else if self.flag_group && col_idx == 0 {
+                    } else if group_kind != GroupKind::None && col_idx == 0 {
                         // we are in the first column, and --group is set
                         // so we write the grouping value
-                        wtr.write_field(&grouping_value)?;
+                        new_row.push_field(grouping_value.as_bytes());
                     } else {
-                        wtr.write_field(b"")?;
+                        new_row.push_field(b"");
                     }
                 }
-                wtr.write_byte_record(&empty_byte_record)?;
+                wtr.write_byte_record(&new_row)?;
             }
         }
+
         Ok(wtr.flush()?)
     }
 

From 9adb4270623bdb32750e9ea5131f5cc54942b199 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 29 Dec 2023 22:40:27 -0500
Subject: [PATCH 2/5] add create_subdir() helper fn to create subdirs in tests

---
 tests/workdir.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/workdir.rs b/tests/workdir.rs
index d601f676d..17d4d1381 100644
--- a/tests/workdir.rs
+++ b/tests/workdir.rs
@@ -252,6 +252,13 @@ impl Workdir {
         }
         Ok(())
     }
+
+    /// Create the subdirectory `name` under this workdir's directory (via `create_dir_all`).
+    pub fn create_subdir(&self, name: &str) -> io::Result<()> {
+        let mut path = self.dir.clone();
+        path.push(name);
+        create_dir_all(path)
+    }
 }
 
 impl fmt::Debug for Workdir {

From 314de1134e38c1d4352d81669eea22e90bd0df15 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 29 Dec 2023 22:41:11 -0500
Subject: [PATCH 3/5] `cat`: add rowskey --group options

---
 tests/test_cat.rs | 117 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 3 deletions(-)

diff --git a/tests/test_cat.rs b/tests/test_cat.rs
index eb605ea05..dc4094ed3 100644
--- a/tests/test_cat.rs
+++ b/tests/test_cat.rs
@@ -256,7 +256,7 @@ fn cat_rowskey_grouping() {
 
     let mut cmd = wrk.command("cat");
     cmd.arg("rowskey")
-        .arg("--group")
+        .args(["--group", "fstem"])
         .arg("in1.csv")
         .arg("in2.csv")
         .arg("in3.csv");
@@ -275,6 +275,117 @@ fn cat_rowskey_grouping() {
     assert_eq!(got, expected);
 }
 
+#[test]
+fn cat_rowskey_grouping_parentdirfname() {
+    let wrk = Workdir::new("cat_rowskey_grouping_parentdirfname");
+    wrk.create(
+        "in1.csv",
+        vec![
+            svec!["a", "b", "c"],
+            svec!["1", "2", "3"],
+            svec!["2", "3", "4"],
+        ],
+    );
+
+    wrk.create_with_delim(
+        "in2.tsv",
+        vec![
+            svec!["c", "a", "b"],
+            svec!["3", "1", "2"],
+            svec!["4", "2", "3"],
+        ],
+        b'\t',
+    );
+
+    // create a subdirectory and put in3.csv in it
+    let _ = wrk.create_subdir("testdir");
+
+    wrk.create(
+        "testdir/in3.csv",
+        vec![
+            svec!["a", "b", "d", "c"],
+            svec!["1", "2", "4", "3"],
+            svec!["2", "3", "5", "4"],
+            svec!["z", "y", "w", "x"],
+        ],
+    );
+
+    let mut cmd = wrk.command("cat");
+    cmd.arg("rowskey")
+        .args(["--group", "parentdirfname"])
+        .arg("in1.csv")
+        .arg("in2.tsv")
+        .arg("testdir/in3.csv");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["file", "a", "b", "c", "d"],
+        svec!["in1.csv", "1", "2", "3", ""],
+        svec!["in1.csv", "2", "3", "4", ""],
+        svec!["in2.tsv", "1", "2", "3", ""],
+        svec!["in2.tsv", "2", "3", "4", ""],
+        svec!["testdir/in3.csv", "1", "2", "3", "4"],
+        svec!["testdir/in3.csv", "2", "3", "4", "5"],
+        svec!["testdir/in3.csv", "z", "y", "x", "w"],
+    ];
+    assert_eq!(got, expected);
+}
+
+#[test]
+fn cat_rowskey_grouping_parentdirfstem() {
+    let wrk = Workdir::new("cat_rowskey_grouping_parentdirfstem");
+    wrk.create(
+        "in1.csv",
+        vec![
+            svec!["a", "b", "c"],
+            svec!["1", "2", "3"],
+            svec!["2", "3", "4"],
+        ],
+    );
+
+    wrk.create(
+        "in2.csv",
+        vec![
+            svec!["c", "a", "b"],
+            svec!["3", "1", "2"],
+            svec!["4", "2", "3"],
+        ],
+    );
+
+    // create a subdirectory and put in3.csv in it
+    let _ = wrk.create_subdir("testdir");
+
+    wrk.create(
+        "testdir/in3.csv",
+        vec![
+            svec!["a", "b", "d", "c"],
+            svec!["1", "2", "4", "3"],
+            svec!["2", "3", "5", "4"],
+            svec!["z", "y", "w", "x"],
+        ],
+    );
+
+    let mut cmd = wrk.command("cat");
+    cmd.arg("rowskey")
+        .args(["--group", "parentdirfstem"])
+        .arg("in1.csv")
+        .arg("in2.csv")
+        .arg("testdir/in3.csv");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![
+        svec!["file", "a", "b", "c", "d"],
+        svec!["in1", "1", "2", "3", ""],
+        svec!["in1", "2", "3", "4", ""],
+        svec!["in2", "1", "2", "3", ""],
+        svec!["in2", "2", "3", "4", ""],
+        svec!["testdir/in3", "1", "2", "3", "4"],
+        svec!["testdir/in3", "2", "3", "4", "5"],
+        svec!["testdir/in3", "z", "y", "x", "w"],
+    ];
+    assert_eq!(got, expected);
+}
+
 #[test]
 fn cat_rowskey_grouping_infile() {
     let wrk = Workdir::new("cat_rowskey_grouping_infile");
@@ -310,7 +421,7 @@ fn cat_rowskey_grouping_infile() {
 
     let mut cmd = wrk.command("cat");
     cmd.arg("rowskey")
-        .arg("--group")
+        .args(["-g", "FStem"])
         .arg("testdata.infile-list");
 
     let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
@@ -360,7 +471,7 @@ fn cat_rowskey_grouping_customname() {
 
     let mut cmd = wrk.command("cat");
     cmd.arg("rowskey")
-        .arg("--group")
+        .args(["--group", "fstem"])
         .args(&["--group-name", "file group label"])
         .arg("in1.csv")
         .arg("in2.csv")

From 02e7f12c9c1778f33bf558b5a9a10542b7f4eb42 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 29 Dec 2023 23:21:20 -0500
Subject: [PATCH 4/5] `tests`: on Windows, the directory separator is backslash

---
 tests/test_cat.rs | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/test_cat.rs b/tests/test_cat.rs
index dc4094ed3..f70bcb48f 100644
--- a/tests/test_cat.rs
+++ b/tests/test_cat.rs
@@ -318,6 +318,20 @@ fn cat_rowskey_grouping_parentdirfname() {
         .arg("testdir/in3.csv");
 
     let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    // on Windows, the directory separator is a backslash, so the output contains `testdir\in3.csv`.
+    // It is written as `\\` below only because Rust string literals escape the backslash.
+    #[cfg(windows)]
+    let expected = vec![
+        svec!["file", "a", "b", "c", "d"],
+        svec!["in1.csv", "1", "2", "3", ""],
+        svec!["in1.csv", "2", "3", "4", ""],
+        svec!["in2.tsv", "1", "2", "3", ""],
+        svec!["in2.tsv", "2", "3", "4", ""],
+        svec!["testdir\\in3.csv", "1", "2", "3", "4"],
+        svec!["testdir\\in3.csv", "2", "3", "4", "5"],
+        svec!["testdir\\in3.csv", "z", "y", "x", "w"],
+    ];
+    #[cfg(not(windows))]
     let expected = vec![
         svec!["file", "a", "b", "c", "d"],
         svec!["in1.csv", "1", "2", "3", ""],
@@ -373,6 +387,20 @@ fn cat_rowskey_grouping_parentdirfstem() {
         .arg("testdir/in3.csv");
 
     let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    // on Windows, the directory separator is a backslash, so the output contains `testdir\in3`.
+    // It is written as `\\` below only because Rust string literals escape the backslash.
+    #[cfg(windows)]
+    let expected = vec![
+        svec!["file", "a", "b", "c", "d"],
+        svec!["in1.csv", "1", "2", "3", ""],
+        svec!["in1.csv", "2", "3", "4", ""],
+        svec!["in2.csv", "1", "2", "3", ""],
+        svec!["in2.csv", "2", "3", "4", ""],
+        svec!["testdir\\in3.csv", "1", "2", "3", "4"],
+        svec!["testdir\\in3.csv", "2", "3", "4", "5"],
+        svec!["testdir\\in3.csv", "z", "y", "x", "w"],
+    ];
+    #[cfg(not(windows))]
     let expected = vec![
         svec!["file", "a", "b", "c", "d"],
         svec!["in1", "1", "2", "3", ""],

From 8e2175de6730d1cf6ad891cb5b22bc5d5934c2d4 Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 29 Dec 2023 23:31:06 -0500
Subject: [PATCH 5/5] ooops... c&p error

---
 tests/test_cat.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_cat.rs b/tests/test_cat.rs
index f70bcb48f..a44255b85 100644
--- a/tests/test_cat.rs
+++ b/tests/test_cat.rs
@@ -392,13 +392,13 @@ fn cat_rowskey_grouping_parentdirfstem() {
     #[cfg(windows)]
     let expected = vec![
         svec!["file", "a", "b", "c", "d"],
-        svec!["in1.csv", "1", "2", "3", ""],
-        svec!["in1.csv", "2", "3", "4", ""],
-        svec!["in2.csv", "1", "2", "3", ""],
-        svec!["in2.csv", "2", "3", "4", ""],
-        svec!["testdir\\in3.csv", "1", "2", "3", "4"],
-        svec!["testdir\\in3.csv", "2", "3", "4", "5"],
-        svec!["testdir\\in3.csv", "z", "y", "x", "w"],
+        svec!["in1", "1", "2", "3", ""],
+        svec!["in1", "2", "3", "4", ""],
+        svec!["in2", "1", "2", "3", ""],
+        svec!["in2", "2", "3", "4", ""],
+        svec!["testdir\\in3", "1", "2", "3", "4"],
+        svec!["testdir\\in3", "2", "3", "4", "5"],
+        svec!["testdir\\in3", "z", "y", "x", "w"],
     ];
     #[cfg(not(windows))]
     let expected = vec![