Skip to content

Commit

Permalink
Merge pull request #1508 from jqnatividad/1506-rowskey-group-options
Browse files Browse the repository at this point in the history
`cat`: add rowskey --group options; increased perf of rowskey
  • Loading branch information
jqnatividad authored Dec 30, 2023
2 parents 2fd23ee + 8e2175d commit 4fbd53c
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 28 deletions.
134 changes: 109 additions & 25 deletions src/cmd/cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,13 @@ cat options:
This is faster, but may result in invalid CSV data.
ROWSKEY OPTIONS:
-g, --group When concatenating with rowskey, use the file stem of each
input file as a grouping value. A new column will be added
to the beginning of each row, using --group-name.
-g, --group <grpkind> When concatenating with rowskey, you can specify a grouping value
which will be used as the first column in the output. This is useful
when you want to know which file a row came from. Valid values are
'fullpath', 'parentdirfname', 'parentdirfstem', 'fname', 'fstem' and 'none'.
A new column will be added to the beginning of each row using --group-name.
If 'none' is specified, no grouping column will be added.
[default: none]
-N, --group-name <arg> When concatenating with rowskey, this flag provides the name
for the new grouping column. [default: file]
Expand All @@ -66,10 +70,14 @@ Common options:
Must be a single character. (default: ,)
"#;

use std::path::PathBuf;
use std::{
path::{Path, PathBuf},
str::FromStr,
};

use indexmap::{IndexMap, IndexSet};
use serde::Deserialize;
use strum_macros::EnumString;
use tempfile;

use crate::{
Expand All @@ -82,7 +90,7 @@ struct Args {
cmd_rows: bool,
cmd_rowskey: bool,
cmd_columns: bool,
flag_group: bool,
flag_group: String,
flag_group_name: String,
arg_input: Vec<PathBuf>,
flag_pad: bool,
Expand All @@ -92,6 +100,45 @@ struct Args {
flag_delimiter: Option<Delimiter>,
}

/// The kind of grouping value that `cat rowskey --group <grpkind>` puts in the
/// grouping column for every row of an input file.
///
/// Parsed from the `--group` flag with `GroupKind::from_str`; because of
/// `ascii_case_insensitive`, the variant names match regardless of ASCII case
/// (e.g. both `fstem` and `FStem` are accepted).
#[derive(Debug, EnumString, PartialEq)]
#[strum(ascii_case_insensitive)]
enum GroupKind {
/// Canonicalized path of the input file.
FullPath,
/// Parent directory joined with the file name (extension included).
ParentDirFName,
/// Parent directory joined with the file stem (extension dropped).
ParentDirFStem,
/// File name only (extension included).
FName,
/// File stem only (extension dropped).
FStem,
/// No grouping column is added to the output.
None,
}

/// Builds the grouping label for `path`: its parent directory joined with either
/// the file name or, when `stem_only` is true, the file stem.
///
/// * `path` - path of the input file.
/// * `stem_only` - `true` to drop the extension (`dir/in3`), `false` to keep it
///   (`dir/in3.csv`).
///
/// Returns `None` only when `path` has no final file component (for example an
/// empty path or one ending in `..`). When the parent directory is empty (a bare
/// file name), only the file name/stem is returned.
///
/// Components that are not valid UTF-8 are converted lossily (invalid sequences
/// become U+FFFD) instead of being dropped or turning the whole result into
/// `None`, so callers that `unwrap()` the result do not panic on such paths.
fn get_parentdir_and_file<P: AsRef<Path>>(path: P, stem_only: bool) -> Option<String> {
    let path = path.as_ref();

    let file_part = if stem_only {
        path.file_stem()
    } else {
        path.file_name()
    };
    let file_part = file_part?;

    // `Path::parent` yields an empty path for a bare file name; treat that as "no parent"
    // so the label does not depend on how an empty prefix would be joined.
    let parent_dir = path.parent().filter(|dir| !dir.as_os_str().is_empty());

    // Join on the `Path` level and convert to a string once, at the very end, so that
    // non-UTF-8 bytes in either component survive as replacement characters.
    let label = match parent_dir {
        Some(dir) => dir.join(file_part).to_string_lossy().into_owned(),
        None => file_part.to_string_lossy().into_owned(),
    };

    Some(label)
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let mut args: Args = util::get_args(USAGE, argv)?;

Expand Down Expand Up @@ -168,9 +215,17 @@ impl Args {
);
}

let Ok(group_kind) = GroupKind::from_str(&self.flag_group) else {
return fail_incorrectusage_clierror!(
"Invalid grouping value `{}`. Valid values are 'fullpath', 'parentdirfname', \
'parentdirfstem', 'fname', 'fstem' and 'none'.",
self.flag_group
);
};

let mut columns_global: AhashIndexSet<Box<[u8]>> = AhashIndexSet::default();

if self.flag_group {
if group_kind != GroupKind::None {
columns_global.insert(self.flag_group_name.as_bytes().to_vec().into_boxed_slice());
}

Expand Down Expand Up @@ -201,14 +256,15 @@ impl Args {
// as we know that all columns are already in columns_global and we don't need to
// validate that the number of columns are the same every time we write a row
let mut wtr = Config::new(&self.flag_output).flexible(true).writer()?;
let mut new_row = csv::ByteRecord::with_capacity(500, num_columns_global);

for c in &columns_global {
wtr.write_field(c)?;
new_row.push_field(c);
}
let empty_byte_record = csv::ByteRecord::new();
wtr.write_byte_record(&empty_byte_record)?;
wtr.write_byte_record(&new_row)?;

// amortize allocations
let mut grouping_value;
let mut grouping_value = String::new();
let mut conf_path;
let mut rdr;
let mut header: &csv::ByteRecord;
Expand Down Expand Up @@ -241,35 +297,63 @@ impl Args {
columns_of_this_file.insert(fi, n);
}

// use the file stem as the grouping value
// safety: we know that this is a file path and if the file path
// is not valid utf8, we convert it to lossy utf8
grouping_value = conf_path
.unwrap()
.file_stem()
.unwrap()
.to_string_lossy()
.to_string();
// set grouping_value
// safety: we know that this is a valid file path and if the file path
// is not utf8, we convert it to lossy utf8
match group_kind {
GroupKind::FullPath => {
grouping_value.clear();
grouping_value
.push_str(&conf_path.unwrap().canonicalize().unwrap().to_string_lossy());
},
GroupKind::ParentDirFName => {
grouping_value.clear();
// grouping_value.push_str(&get_parent_and_filename(&conf_path.unwrap()).
// unwrap());
grouping_value
.push_str(&get_parentdir_and_file(conf_path.unwrap(), false).unwrap());
},
GroupKind::ParentDirFStem => {
grouping_value.clear();
// grouping_value.push_str(&get_parent_and_filename(&conf_path.unwrap()).
// unwrap());
grouping_value
.push_str(&get_parentdir_and_file(conf_path.unwrap(), true).unwrap());
},
GroupKind::FName => {
grouping_value.clear();
grouping_value
.push_str(&conf_path.unwrap().file_name().unwrap().to_string_lossy());
},
GroupKind::FStem => {
grouping_value.clear();
grouping_value
.push_str(&conf_path.unwrap().file_stem().unwrap().to_string_lossy());
},
GroupKind::None => {},
}

while rdr.read_byte_record(&mut row)? {
new_row.clear();
for (col_idx, c) in columns_global.iter().enumerate() {
if let Some(idx) = columns_of_this_file.get(c) {
if let Some(d) = row.get(*idx) {
wtr.write_field(d)?;
new_row.push_field(d);
} else {
wtr.write_field(b"")?;
new_row.push_field(b"");
}
} else if self.flag_group && col_idx == 0 {
} else if group_kind != GroupKind::None && col_idx == 0 {
// we are in the first column, and --group is set
// so we write the grouping value
wtr.write_field(&grouping_value)?;
new_row.push_field(grouping_value.as_bytes());
} else {
wtr.write_field(b"")?;
new_row.push_field(b"");
}
}
wtr.write_byte_record(&empty_byte_record)?;
wtr.write_byte_record(&new_row)?;
}
}

Ok(wtr.flush()?)
}

Expand Down
145 changes: 142 additions & 3 deletions tests/test_cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ fn cat_rowskey_grouping() {

let mut cmd = wrk.command("cat");
cmd.arg("rowskey")
.arg("--group")
.args(["--group", "fstem"])
.arg("in1.csv")
.arg("in2.csv")
.arg("in3.csv");
Expand All @@ -275,6 +275,145 @@ fn cat_rowskey_grouping() {
assert_eq!(got, expected);
}

/// `cat rowskey --group parentdirfname` must label every row with the parent directory
/// (if any) joined with the full file name of the input the row came from.
#[test]
fn cat_rowskey_grouping_parentdirfname() {
    let wrk = Workdir::new("cat_rowskey_grouping_parentdirfname");
    wrk.create(
        "in1.csv",
        vec![
            svec!["a", "b", "c"],
            svec!["1", "2", "3"],
            svec!["2", "3", "4"],
        ],
    );

    wrk.create_with_delim(
        "in2.tsv",
        vec![
            svec!["c", "a", "b"],
            svec!["3", "1", "2"],
            svec!["4", "2", "3"],
        ],
        b'\t',
    );

    // in3.csv lives in a subdirectory so that its grouping value carries a parent dir.
    // Fail right here if the directory cannot be created, rather than ignoring the error
    // and failing later with a less obvious message.
    wrk.create_subdir("testdir")
        .expect("failed to create the `testdir` subdirectory in the test workdir");

    wrk.create(
        "testdir/in3.csv",
        vec![
            svec!["a", "b", "d", "c"],
            svec!["1", "2", "4", "3"],
            svec!["2", "3", "5", "4"],
            svec!["z", "y", "w", "x"],
        ],
    );

    let mut cmd = wrk.command("cat");
    cmd.arg("rowskey")
        .args(["--group", "parentdirfname"])
        .arg("in1.csv")
        .arg("in2.tsv")
        .arg("testdir/in3.csv");

    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);

    // The grouping value is joined with the platform's path separator. On Windows that is
    // a single backslash; it is only written as `\\` here because backslash is the escape
    // character in Rust string literals.
    let in3_group: &'static str = if cfg!(windows) {
        "testdir\\in3.csv"
    } else {
        "testdir/in3.csv"
    };

    let expected = vec![
        svec!["file", "a", "b", "c", "d"],
        svec!["in1.csv", "1", "2", "3", ""],
        svec!["in1.csv", "2", "3", "4", ""],
        svec!["in2.tsv", "1", "2", "3", ""],
        svec!["in2.tsv", "2", "3", "4", ""],
        svec![in3_group, "1", "2", "3", "4"],
        svec![in3_group, "2", "3", "4", "5"],
        svec![in3_group, "z", "y", "x", "w"],
    ];
    assert_eq!(got, expected);
}

/// `cat rowskey --group parentdirfstem` must label every row with the parent directory
/// (if any) joined with the file stem (no extension) of the input the row came from.
#[test]
fn cat_rowskey_grouping_parentdirfstem() {
    let wrk = Workdir::new("cat_rowskey_grouping_parentdirfstem");
    wrk.create(
        "in1.csv",
        vec![
            svec!["a", "b", "c"],
            svec!["1", "2", "3"],
            svec!["2", "3", "4"],
        ],
    );

    wrk.create(
        "in2.csv",
        vec![
            svec!["c", "a", "b"],
            svec!["3", "1", "2"],
            svec!["4", "2", "3"],
        ],
    );

    // in3.csv lives in a subdirectory so that its grouping value carries a parent dir.
    // Fail right here if the directory cannot be created, rather than ignoring the error
    // and failing later with a less obvious message.
    wrk.create_subdir("testdir")
        .expect("failed to create the `testdir` subdirectory in the test workdir");

    wrk.create(
        "testdir/in3.csv",
        vec![
            svec!["a", "b", "d", "c"],
            svec!["1", "2", "4", "3"],
            svec!["2", "3", "5", "4"],
            svec!["z", "y", "w", "x"],
        ],
    );

    let mut cmd = wrk.command("cat");
    cmd.arg("rowskey")
        .args(["--group", "parentdirfstem"])
        .arg("in1.csv")
        .arg("in2.csv")
        .arg("testdir/in3.csv");

    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);

    // The grouping value is joined with the platform's path separator. On Windows that is
    // a single backslash; it is only written as `\\` here because backslash is the escape
    // character in Rust string literals.
    let in3_group: &'static str = if cfg!(windows) {
        "testdir\\in3"
    } else {
        "testdir/in3"
    };

    let expected = vec![
        svec!["file", "a", "b", "c", "d"],
        svec!["in1", "1", "2", "3", ""],
        svec!["in1", "2", "3", "4", ""],
        svec!["in2", "1", "2", "3", ""],
        svec!["in2", "2", "3", "4", ""],
        svec![in3_group, "1", "2", "3", "4"],
        svec![in3_group, "2", "3", "4", "5"],
        svec![in3_group, "z", "y", "x", "w"],
    ];
    assert_eq!(got, expected);
}

#[test]
fn cat_rowskey_grouping_infile() {
let wrk = Workdir::new("cat_rowskey_grouping_infile");
Expand Down Expand Up @@ -310,7 +449,7 @@ fn cat_rowskey_grouping_infile() {

let mut cmd = wrk.command("cat");
cmd.arg("rowskey")
.arg("--group")
.args(["-g", "FStem"])
.arg("testdata.infile-list");

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
Expand Down Expand Up @@ -360,7 +499,7 @@ fn cat_rowskey_grouping_customname() {

let mut cmd = wrk.command("cat");
cmd.arg("rowskey")
.arg("--group")
.args(["--group", "fstem"])
.args(&["--group-name", "file group label"])
.arg("in1.csv")
.arg("in2.csv")
Expand Down
7 changes: 7 additions & 0 deletions tests/workdir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,13 @@ impl Workdir {
}
Ok(())
}

/// Creates the subdirectory `name` underneath this workdir's `dir`.
///
/// The target path is `dir` with `name` appended; it is handed to `create_dir_all`,
/// whose `io::Result` is returned to the caller unchanged.
pub fn create_subdir(&self, name: &str) -> io::Result<()> {
    let subdir = self.dir.join(name);
    create_dir_all(subdir)
}
}

impl fmt::Debug for Workdir {
Expand Down

0 comments on commit 4fbd53c

Please sign in to comment.