Skip to content

Commit

Permalink
Merge pull request #2287 from jqnatividad/joinp-ignore-case_option
Browse files Browse the repository at this point in the history
`joinp`: add `--ignore-case` option
  • Loading branch information
jqnatividad authored Nov 13, 2024
2 parents 33fa54a + a719360 commit c08e621
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 3 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
| [index](/src/cmd/index.rs#L2) | Create an index (📇) for a CSV. This is very quick (even the 15gb, 28m row NYC 311 dataset takes all of 14 seconds to index) & provides constant time indexing/random access into the CSV. With an index, `count`, `sample` & `slice` work instantaneously; random access mode is enabled in `luau`; and multithreading (🏎️) is enabled for the `frequency`, `split`, `stats`, `schema` & `tojsonl` commands. |
| [input](/src/cmd/input.rs#L2) | Read CSV data with special commenting, quoting, trimming, line-skipping & non-UTF8 encoding handling rules. Typically used to "normalize" a CSV for further processing with other qsv commands. |
| [join](/src/cmd/join.rs#L2)<br>👆 | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast. |
| [joinp](/src/cmd/joinp.rs#L2)<br>✨🚀🐻‍❄️🪄 | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. However, `joinp` doesn't have an --ignore-case option. |
| [joinp](/src/cmd/joinp.rs#L2)<br>✨🚀🐻‍❄️🪄 | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. |
| [json](/src/cmd/json.rs#L2)<br>👆 | Convert JSON to CSV.
| [jsonl](/src/cmd/jsonl.rs#L2)<br>🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL.
| [lens](/src/cmd/lens.rs#L2) | Interactively view, search & filter a CSV using the [csvlens](https://github.com/YS-L/csvlens#csvlens) engine.
Expand Down
75 changes: 73 additions & 2 deletions src/cmd/joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ joinp arguments:
Note that <input1> is the left CSV data set and <input2> is the right CSV data set.
joinp options:
-i, --ignore-case When set, joins are done case insensitively.
--left Do a 'left outer' join. This returns all rows in
first CSV data set, including rows with no
corresponding row in the second data set. When no
Expand Down Expand Up @@ -250,6 +251,7 @@ struct Args {
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
flag_quiet: bool,
flag_ignore_case: bool,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
Expand Down Expand Up @@ -384,6 +386,7 @@ struct JoinStruct {
time_format: Option<String>,
float_precision: Option<usize>,
null_value: String,
ignore_case: bool,
}

impl JoinStruct {
Expand All @@ -393,17 +396,70 @@ impl JoinStruct {
validation: JoinValidation,
asof_join: bool,
) -> CliResult<(usize, usize)> {
let left_selcols: Vec<_> = self
let mut left_selcols: Vec<_> = self
.left_sel
.split(',')
.map(polars::lazy::dsl::col)
.collect();
let right_selcols: Vec<_> = self
let mut right_selcols: Vec<_> = self
.right_sel
.split(',')
.map(polars::lazy::dsl::col)
.collect();

// If ignore_case is enabled, create lowercase versions of the join columns
if self.ignore_case {
// Create temporary lowercase versions of join columns in left dataframe
for col in &left_selcols {
self.left_lf = self
.left_lf
.with_column(col.clone().str().to_lowercase().alias(&format!(
"_qsv-{}-lower",
col.to_string()
.trim_start_matches(r#"col(""#)
.trim_end_matches(r#"")"#)
)));
}

// Create temporary lowercase versions of join columns in right dataframe
for col in &right_selcols {
self.right_lf = self
.right_lf
.with_column(col.clone().str().to_lowercase().alias(&format!(
"_qsv-{}-lower",
col.to_string()
.trim_start_matches(r#"col(""#)
.trim_end_matches(r#"")"#)
)));
}

// Create new vectors for the lowercase column names
let left_selcols_w: Vec<_> = left_selcols
.iter()
.map(|col| {
polars::lazy::dsl::col(&format!(
"_qsv-{}-lower",
col.to_string()
.trim_start_matches(r#"col(""#)
.trim_end_matches(r#"")"#)
))
})
.collect();
left_selcols = left_selcols_w;
let right_selcols_w: Vec<_> = right_selcols
.iter()
.map(|col| {
polars::lazy::dsl::col(&format!(
"_qsv-{}-lower",
col.to_string()
.trim_start_matches(r#"col(""#)
.trim_end_matches(r#"")"#)
))
})
.collect();
right_selcols = right_selcols_w;
}

let left_selcols_len = left_selcols.len();
let right_selcols_len = right_selcols.len();

Expand Down Expand Up @@ -497,6 +553,20 @@ impl JoinStruct {
join_results
};

// if self.ignore_case, remove the temporary lowercase columns from the dataframe
if self.ignore_case {
// Get all column names
let cols = results_df.get_column_names();
// Filter out the lowercase columns (those with "_qsv-*-lower" pattern)
let keep_cols: Vec<String> = cols
.iter()
.filter(|&col| !(col.starts_with("_qsv-") && col.ends_with("-lower")))
.map(|&s| s.to_string())
.collect();
// Select only the non-lowercase columns
results_df = results_df.select(keep_cols)?;
}

let mut out_delim = self.delim;
let mut out_writer = match self.output {
Some(ref output_file) => {
Expand Down Expand Up @@ -780,6 +850,7 @@ impl Args {
} else {
self.flag_null_value.clone()
},
ignore_case: self.flag_ignore_case,
})
}
}
Expand Down
43 changes: 43 additions & 0 deletions tests/test_joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1244,3 +1244,46 @@ fn joinp_asof_date_diffcolnames_sqlfilter() {
];
assert_eq!(got, expected);
}

// Case-insensitive join (`--ignore-case`) test.

#[test]
fn joinp_ignore_case() {
    // Left input: city keys in upper, lower and title case.
    let cities = vec![
        svec!["city", "state"],
        svec!["BOSTON", "MA"],
        svec!["new york", "NY"],
        svec!["San Francisco", "CA"],
        svec!["BUFFALO", "NY"],
    ];

    // Right input: "Boston" appears twice with different casing, neither of
    // which is byte-equal to the left side's "BOSTON"; "orlando" has no
    // counterpart on the left.
    let places = vec![
        svec!["city", "place"],
        svec!["Boston", "Logan Airport"],
        svec!["boston", "Boston Garden"],
        svec!["BUFFALO", "Ralph Wilson Stadium"],
        svec!["orlando", "Disney World"],
    ];

    let workdir = Workdir::new("joinp_ignore_case");
    workdir.create("cities_mixed.csv", cities);
    workdir.create("places_mixed.csv", places);

    // Join both files on their `city` column with case-insensitive matching.
    let mut join_cmd = workdir.command("joinp");
    join_cmd.args(&["city", "cities_mixed.csv", "city", "places_mixed.csv"]);
    join_cmd.arg("--ignore-case");

    let actual: Vec<Vec<String>> = workdir.read_stdout(&mut join_cmd);

    // Only rows whose keys match when case is ignored are expected, and each
    // side keeps its original casing in the output. No temporary helper
    // column is expected in the header.
    let want = vec![
        svec!["city", "state", "city_right", "place"],
        svec!["BOSTON", "MA", "Boston", "Logan Airport"],
        svec!["BOSTON", "MA", "boston", "Boston Garden"],
        svec!["BUFFALO", "NY", "BUFFALO", "Ralph Wilson Stadium"],
    ];
    assert_eq!(actual, want);
}

0 comments on commit c08e621

Please sign in to comment.