From fc3b523abfa9f244eb9490d9412887c7d86f4da7 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 5 Jul 2024 19:30:40 -0400 Subject: [PATCH 1/4] `deps`: bump to latest polars upstream with addl fixes/features; specifically right join support --- Cargo.lock | 36 ++++++++++++++++++------------------ Cargo.toml | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fa5508256..ddb9bd771 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4028,7 +4028,7 @@ dependencies = [ [[package]] name = "polars" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "getrandom", "polars-arrow", @@ -4047,7 +4047,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "atoi", @@ -4094,7 +4094,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "bytemuck", "either", @@ -4109,7 +4109,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4142,7 +4142,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.41.3" -source = 
"git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "avro-schema", "polars-arrow-format", @@ -4154,7 +4154,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4173,7 +4173,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "async-trait", @@ -4212,7 +4212,7 @@ dependencies = [ [[package]] name = "polars-json" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "chrono", @@ -4233,7 +4233,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4259,7 +4259,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "polars-arrow", "polars-core", @@ -4277,7 +4277,7 
@@ dependencies = [ [[package]] name = "polars-ops" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "argminmax", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "async-stream", @@ -4336,7 +4336,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -4361,7 +4361,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "bytemuck", @@ -4390,7 +4390,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "bytemuck", "polars-arrow", @@ -4401,7 +4401,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = 
"git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "hex", "once_cell", @@ -4421,7 +4421,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "atoi", "bytemuck", @@ -4442,7 +4442,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.41.3" -source = "git+https://github.com/pola-rs/polars?rev=276655a#276655a3b00f63fd92f57c3aa7d8aa1ea35f1c8b" +source = "git+https://github.com/pola-rs/polars?rev=34126ca#34126cac12d5a69de32f7b6ad017303b92043ba9" dependencies = [ "ahash 0.8.11", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index 937bbdda3..67c060094 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -260,7 +260,7 @@ calamine = { git = "https://github.com/tafia/calamine", rev = "6b41309" } # use modernized version of local_encoding local-encoding = { git = "https://github.com/slonopotamus/local-encoding-rs", branch = "travis-madness" } # use latest upstream version of polars with additional unreleased features/fixes -polars = { git = "https://github.com/pola-rs/polars", rev = "276655a" } +polars = { git = "https://github.com/pola-rs/polars", rev = "34126ca" } [features] From 2e0c7eab75e343cb1ccfcc861ea4442094169610 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 5 Jul 2024 19:30:53 -0400 Subject: [PATCH 2/4] `joinp`: add right join support --- src/cmd/joinp.rs | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/src/cmd/joinp.rs b/src/cmd/joinp.rs index c30168f21..803626683 100644 --- a/src/cmd/joinp.rs +++ b/src/cmd/joinp.rs @@ -8,7 +8,7 @@ Unlike the join command, joinp can process files larger than RAM, is multithread has join key validation, pre-join 
filtering, supports asof joins & its output columns can be coalesced (no duplicate columns). -However, joinp doesn't have an --ignore-case option & it doesn't support right outer joins. +However, joinp doesn't have an --ignore-case option. Returns the shape of the join result (number of rows, number of columns) to stderr. @@ -41,6 +41,11 @@ joinp options: --left-semi This returns only the rows in the first CSV data set that have a corresponding row in the second data set. The output schema is the same as the first data set. + --right Do a 'right outer' join. This returns all rows in + the second CSV data set, including rows with no + corresponding row in the first data set. When no + corresponding row exists, it is padded out with + empty fields. (This is the reverse of a 'left outer' join.) --full Do a 'full outer' join. This returns all rows in both data sets with matching records joined. If there is no match, the missing side will be padded @@ -200,6 +205,7 @@ struct Args { flag_left: bool, flag_left_anti: bool, flag_left_semi: bool, + flag_right: bool, flag_full: bool, flag_cross: bool, flag_coalesce: bool, @@ -263,17 +269,33 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.flag_left, args.flag_left_anti, args.flag_left_semi, + args.flag_right, args.flag_full, args.flag_cross, args.flag_asof, ) { - (false, false, false, false, false, false) => join.run(JoinType::Inner, validation, false), - (true, false, false, false, false, false) => join.run(JoinType::Left, validation, false), - (false, true, false, false, false, false) => join.run(JoinType::Anti, validation, false), - (false, false, true, false, false, false) => join.run(JoinType::Semi, validation, false), - (false, false, false, true, false, false) => join.run(JoinType::Full, validation, false), - (false, false, false, false, true, false) => join.run(JoinType::Cross, validation, false), - (false, false, false, false, false, true) => { + (false, false, false, false, false, false, false) => { +
join.run(JoinType::Inner, validation, false) + }, + (true, false, false, false, false, false, false) => { + join.run(JoinType::Left, validation, false) + }, + (false, true, false, false, false, false, false) => { + join.run(JoinType::Anti, validation, false) + }, + (false, false, false, true, false, false, false) => { + join.run(JoinType::Right, validation, false) + }, + (false, false, true, false, false, false, false) => { + join.run(JoinType::Semi, validation, false) + }, + (false, false, false, false, true, false, false) => { + join.run(JoinType::Full, validation, false) + }, + (false, false, false, false, false, true, false) => { + join.run(JoinType::Cross, validation, false) + }, + (false, false, false, false, false, false, true) => { // safety: flag_strategy is always is_some() as it has a default value args.flag_strategy = Some(args.flag_strategy.unwrap().to_lowercase()); let strategy = match args.flag_strategy.as_deref() { From 0f5cb7eccefff9610fbf246a416f610e2e844cc8 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 5 Jul 2024 19:31:07 -0400 Subject: [PATCH 3/4] `tests`: add `joinp --right` test --- tests/test_joinp.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_joinp.rs b/tests/test_joinp.rs index b64a99e43..c081f7ad3 100644 --- a/tests/test_joinp.rs +++ b/tests/test_joinp.rs @@ -371,6 +371,24 @@ joinp_test!( } ); +joinp_test!( + joinp_outer_right_none_streaming, + |wrk: Workdir, mut cmd: process::Command| { + cmd.arg("--right") + .args(["--validate", "none"]) + .arg("--streaming"); + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected: Vec<Vec<String>> = vec![ + svec!["state", "city", "place"], + svec!["MA", "Boston", "Logan Airport"], + svec!["MA", "Boston", "Boston Garden"], + svec!["NY", "Buffalo", "Ralph Wilson Stadium"], + svec!["", "Orlando", "Disney World"], + ]; + assert_eq!(got, expected); + } +); + joinp_test_comments!( joinp_outer_left_validate_none_comments, 
|wrk: Workdir, mut cmd: process::Command| { From a6e075d946e75c1bf148cdf45bbfd2d0c008b20d Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 5 Jul 2024 19:33:00 -0400 Subject: [PATCH 4/4] `docs`: update `joinp` description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 685abc5ec..04c693cd0 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ | [index](/src/cmd/index.rs#L2) | Create an index (📇) for a CSV. This is very quick (even the 15gb, 28m row NYC 311 dataset takes all of 14 seconds to index) & provides constant time indexing/random access into the CSV. With an index, `count`, `sample` & `slice` work instantaneously; random access mode is enabled in `luau`; and multithreading (🏎️) is enabled for the `frequency`, `split`, `stats`, `schema` & `tojsonl` commands. | | [input](/src/cmd/input.rs#L2) | Read CSV data with special commenting, quoting, trimming, line-skipping & non-UTF8 encoding handling rules. Typically used to "normalize" a CSV for further processing with other qsv commands. | | [join](/src/cmd/join.rs#L2)
👆 | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast. | -| [joinp](/src/cmd/joinp.rs#L2)
✨🚀🐻‍❄️ | Inner, outer, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output doesn't have duplicate columns. However, `joinp` doesn't have an --ignore-case option & it doesn't support right outer joins. | +| [joinp](/src/cmd/joinp.rs#L2)
✨🚀🐻‍❄️ | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/jqnatividad/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. However, `joinp` doesn't have an --ignore-case option. | | [jsonl](/src/cmd/jsonl.rs#L2)
🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL. | [json](/src/cmd/json.rs#L2)
| Convert non-nested JSON to CSV. |
[luau](/src/cmd/luau.rs#L2) 👑
✨📇🌐🔣 ![CKAN](docs/images/ckan.png) | Create multiple new computed columns, filter rows, compute aggregations and build complex data pipelines by executing a [Luau](https://luau-lang.org) [0.630](https://github.com/Roblox/luau/releases/tag/0.630) expression/script for every row of a CSV file ([sequential mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L254-L298)), or using [random access](https://www.webopedia.com/definitions/random-access/) with an index ([random access mode](https://github.com/jqnatividad/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L367-L415)).
Can process a single Luau expression or [full-fledged data-wrangling scripts using lookup tables](https://github.com/dathere/qsv-lookup-tables#example) with discrete BEGIN, MAIN and END sections.
It is not just another qsv command, it is qsv's [Domain-specific Language](https://en.wikipedia.org/wiki/Domain-specific_language) (DSL) with [numerous qsv-specific helper functions](https://github.com/jqnatividad/qsv/blob/113eee17b97882dc368b2e65fec52b86df09f78b/src/cmd/luau.rs#L1356-L2290) to build production data pipelines. |