Merge branch 'main' into feature/string_to_array

# Conflicts: # datafusion/sqllogictest/test_files/string/string_view.slt
Omega359 · Nov 16, 2024 · 98a2828 · 98a2828
2 parents a8ed675 + 6d8313e
commit 98a2828
Show file tree

Hide file tree

Showing 78 changed files with 2,007 additions and 873 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -145,7 +145,7 @@ recursive = "0.1.1"
 regex = "1.8"
 rstest = "0.23.0"
 serde_json = "1"
-sqlparser = { version = "0.51.0", features = ["visitor"] }
+sqlparser = { version = "0.52.0", features = ["visitor"] }
 tempfile = "3"
 tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
 url = "2.2"

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -330,6 +330,30 @@ steps.
 The tests sort the entire dataset using several different sort
 orders.
 
+## Sort TPCH
+
+Test performance of end-to-end sort SQL queries. (While the `Sort` benchmark focuses on a single sort executor, this benchmark tests how sorting is executed across multiple CPU cores by benchmarking sorting the whole relational table.)
+
+Sort integration benchmark runs whole table sort queries on TPCH `lineitem` table, with different characteristics. For example, different number of sort keys, different sort key cardinality, different number of payload columns, etc.
+
+See [`sort_tpch.rs`](src/sort_tpch.rs) for more details.
+
+### Sort TPCH Benchmark Example Runs
+1. Run all queries with default setting:
+```bash
+ cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json'
+```
+
+2. Run a specific query:
+```bash
+ cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json' --query 2
+```
+
+3. Run all queries with `bench.sh` script:
+```bash
+./bench.sh run sort_tpch
+```
+
 ## IMDB
 
 Run Join Order Benchmark (JOB) on IMDB dataset.

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -75,6 +75,7 @@ tpch10:                 TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB),
 tpch_mem10:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
 parquet:                Benchmark of parquet reader's filtering speed
 sort:                   Benchmark of sorting speed
+sort_tpch:              Benchmark of sorting speed for end-to-end sort queries on TPCH dataset
 clickbench_1:           ClickBench queries against a single parquet file
 clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
 clickbench_extended:    ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
@@ -175,6 +176,10 @@ main() {
                     # same data as for tpch
                     data_tpch "1"
                     ;;
+                sort_tpch)
+                    # same data as for tpch
+                    data_tpch "1"
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                     usage
@@ -252,6 +257,9 @@ main() {
                 external_aggr)
                     run_external_aggr
                     ;;
+                sort_tpch)
+                    run_sort_tpch
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
                     usage
@@ -549,6 +557,16 @@ run_external_aggr() {
     $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
 }
 
+# Runs the sort integration benchmark
+run_sort_tpch() {
+    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    RESULTS_FILE="${RESULTS_DIR}/sort_tpch.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running sort tpch benchmark..."
+
+    $CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+}
+
 
 compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"

diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs
@@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
 #[global_allocator]
 static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
-use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch};
+use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, sort_tpch, tpch};
 
 #[derive(Debug, StructOpt)]
 #[structopt(about = "benchmark command")]
@@ -43,6 +43,7 @@ enum Options {
     Clickbench(clickbench::RunOpt),
     ParquetFilter(parquet_filter::RunOpt),
     Sort(sort::RunOpt),
+    SortTpch(sort_tpch::RunOpt),
     Imdb(imdb::RunOpt),
 }
 
@@ -57,6 +58,7 @@ pub async fn main() -> Result<()> {
         Options::Clickbench(opt) => opt.run().await,
         Options::ParquetFilter(opt) => opt.run().await,
         Options::Sort(opt) => opt.run().await,
+        Options::SortTpch(opt) => opt.run().await,
         Options::Imdb(opt) => opt.run().await,
     }
 }
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
@@ -20,5 +20,6 @@ pub mod clickbench;
 pub mod imdb;
 pub mod parquet_filter;
 pub mod sort;
+pub mod sort_tpch;
 pub mod tpch;
 pub mod util;