Merge pull request #2287 from jqnatividad/joinp-ignore-case_option

`joinp`: add `--ignore-case` option
dathere · Nov 13, 2024 · c08e621 · c08e621
2 parents 33fa54a + a719360
commit c08e621
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@
 | [index](/src/cmd/index.rs#L2) | Create an index (📇) for a CSV. This is very quick (even the 15gb, 28m row NYC 311 dataset takes all of 14 seconds to index) & provides constant time indexing/random access into the CSV. With an index, `count`, `sample` & `slice` work instantaneously; random access mode is enabled in `luau`; and multithreading (🏎️) is enabled for the `frequency`, `split`, `stats`, `schema` & `tojsonl` commands. |
 | [input](/src/cmd/input.rs#L2) | Read CSV data with special commenting, quoting, trimming, line-skipping & non-UTF8 encoding handling rules. Typically used to "normalize" a CSV for further processing with other qsv commands. |
 | [join](/src/cmd/join.rs#L2)<br>👆 | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast.  |
-| [joinp](/src/cmd/joinp.rs#L2)<br>✨🚀🐻‍❄️🪄 | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. However, `joinp` doesn't have an --ignore-case option. |
+| [joinp](/src/cmd/joinp.rs#L2)<br>✨🚀🐻‍❄️🪄 | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. |
 | [json](/src/cmd/json.rs#L2)<br>👆 | Convert JSON to CSV.
 | [jsonl](/src/cmd/jsonl.rs#L2)<br>🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL.
 | [lens](/src/cmd/lens.rs#L2) | Interactively view, search & filter a CSV using the [csvlens](https://github.com/YS-L/csvlens#csvlens) engine.

diff --git a/src/cmd/joinp.rs b/src/cmd/joinp.rs
@@ -29,6 +29,7 @@ joinp arguments:
     Note that <input1> is the left CSV data set and <input2> is the right CSV data set.
 
 joinp options:
+    -i, --ignore-case      When set, joins are done case insensitively.
     --left                 Do a 'left outer' join. This returns all rows in
                            first CSV data set, including rows with no
                            corresponding row in the second data set. When no
@@ -250,6 +251,7 @@ struct Args {
     flag_output:           Option<String>,
     flag_delimiter:        Option<Delimiter>,
     flag_quiet:            bool,
+    flag_ignore_case:      bool,
 }
 
 pub fn run(argv: &[&str]) -> CliResult<()> {
@@ -384,6 +386,7 @@ struct JoinStruct {
     time_format:      Option<String>,
     float_precision:  Option<usize>,
     null_value:       String,
+    ignore_case:      bool,
 }
 
 impl JoinStruct {
@@ -393,17 +396,70 @@ impl JoinStruct {
         validation: JoinValidation,
         asof_join: bool,
     ) -> CliResult<(usize, usize)> {
-        let left_selcols: Vec<_> = self
+        let mut left_selcols: Vec<_> = self
             .left_sel
             .split(',')
             .map(polars::lazy::dsl::col)
             .collect();
-        let right_selcols: Vec<_> = self
+        let mut right_selcols: Vec<_> = self
             .right_sel
             .split(',')
             .map(polars::lazy::dsl::col)
             .collect();
 
+        // If ignore_case is enabled, create lowercase versions of the join columns
+        if self.ignore_case {
+            // Create temporary lowercase versions of join columns in left dataframe
+            for col in &left_selcols {
+                self.left_lf = self
+                    .left_lf
+                    .with_column(col.clone().str().to_lowercase().alias(&format!(
+                        "_qsv-{}-lower",
+                        col.to_string()
+                            .trim_start_matches(r#"col(""#)
+                            .trim_end_matches(r#"")"#)
+                    )));
+            }
+
+            // Create temporary lowercase versions of join columns in right dataframe
+            for col in &right_selcols {
+                self.right_lf = self
+                    .right_lf
+                    .with_column(col.clone().str().to_lowercase().alias(&format!(
+                        "_qsv-{}-lower",
+                        col.to_string()
+                            .trim_start_matches(r#"col(""#)
+                            .trim_end_matches(r#"")"#)
+                    )));
+            }
+
+            // Create new vectors for the lowercase column names
+            let left_selcols_w: Vec<_> = left_selcols
+                .iter()
+                .map(|col| {
+                    polars::lazy::dsl::col(&format!(
+                        "_qsv-{}-lower",
+                        col.to_string()
+                            .trim_start_matches(r#"col(""#)
+                            .trim_end_matches(r#"")"#)
+                    ))
+                })
+                .collect();
+            left_selcols = left_selcols_w;
+            let right_selcols_w: Vec<_> = right_selcols
+                .iter()
+                .map(|col| {
+                    polars::lazy::dsl::col(&format!(
+                        "_qsv-{}-lower",
+                        col.to_string()
+                            .trim_start_matches(r#"col(""#)
+                            .trim_end_matches(r#"")"#)
+                    ))
+                })
+                .collect();
+            right_selcols = right_selcols_w;
+        }
+
         let left_selcols_len = left_selcols.len();
         let right_selcols_len = right_selcols.len();
 
@@ -497,6 +553,20 @@ impl JoinStruct {
             join_results
         };
 
+        // if self.ignore_case, remove the temporary lowercase columns from the dataframe
+        if self.ignore_case {
+            // Get all column names
+            let cols = results_df.get_column_names();
+            // Filter out the lowercase columns (those with "_qsv-*-lower" pattern)
+            let keep_cols: Vec<String> = cols
+                .iter()
+                .filter(|&col| !(col.starts_with("_qsv-") && col.ends_with("-lower")))
+                .map(|&s| s.to_string())
+                .collect();
+            // Select only the non-lowercase columns
+            results_df = results_df.select(keep_cols)?;
+        }
+
         let mut out_delim = self.delim;
         let mut out_writer = match self.output {
             Some(ref output_file) => {
@@ -780,6 +850,7 @@ impl Args {
             } else {
                 self.flag_null_value.clone()
             },
+            ignore_case: self.flag_ignore_case,
         })
     }
 }

diff --git a/tests/test_joinp.rs b/tests/test_joinp.rs
@@ -1244,3 +1244,46 @@ fn joinp_asof_date_diffcolnames_sqlfilter() {
     ];
     assert_eq!(got, expected);
 }
+
+// Case-insensitive join test for the `joinp --ignore-case` option.
+
+#[test]
+fn joinp_ignore_case() { // runs `joinp --ignore-case` on `city` keys that differ only by letter case
+    let wrk = Workdir::new("joinp_ignore_case");
+
+    // Left input: `city` keys written in upper, lower and title case
+    wrk.create(
+        "cities_mixed.csv",
+        vec![
+            svec!["city", "state"],
+            svec!["BOSTON", "MA"],
+            svec!["new york", "NY"], // no row for this city in places_mixed.csv
+            svec!["San Francisco", "CA"], // no row for this city in places_mixed.csv
+            svec!["BUFFALO", "NY"],
+        ],
+    );
+
+    wrk.create( // right input: two case variants of Boston, an exact-case BUFFALO, and one unmatched city
+        "places_mixed.csv",
+        vec![
+            svec!["city", "place"],
+            svec!["Boston", "Logan Airport"],
+            svec!["boston", "Boston Garden"],
+            svec!["BUFFALO", "Ralph Wilson Stadium"],
+            svec!["orlando", "Disney World"], // no row for this city in cities_mixed.csv
+        ],
+    );
+
+    let mut cmd = wrk.command("joinp");
+    cmd.args(&["city", "cities_mixed.csv", "city", "places_mixed.csv"]) // join key `city` on both inputs
+        .arg("--ignore-case");
+
+    let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
+    let expected = vec![ // only case-insensitive matches; values keep their original casing
+        svec!["city", "state", "city_right", "place"], // no helper lowercase columns are expected in the header
+        svec!["BOSTON", "MA", "Boston", "Logan Airport"],
+        svec!["BOSTON", "MA", "boston", "Boston Garden"],
+        svec!["BUFFALO", "NY", "BUFFALO", "Ralph Wilson Stadium"],
+    ];
+    assert_eq!(got, expected);
+}