Merge main + add tests

- fixes; - tests; - comments in the code;
apache · Nov 19, 2024 · 659ab7a · 659ab7a
2 parents 6e41858 + ca3a529
commit 659ab7a
Show file tree

Hide file tree

Showing 58 changed files with 2,574 additions and 411 deletions.
diff --git a/.github/actions/setup-spark-builder/action.yaml b/.github/actions/setup-spark-builder/action.yaml
@@ -29,7 +29,7 @@ inputs:
   comet-version:
     description: 'The Comet version to use for Spark'
     required: true
-    default: '0.4.0-SNAPSHOT'
+    default: '0.5.0-SNAPSHOT'
 runs:
   using: "composite"
   steps:

diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
@@ -71,7 +71,7 @@ jobs:
         with:
           spark-version: ${{ matrix.spark-version.full }}
           spark-short-version: ${{ matrix.spark-version.short }}
-          comet-version: '0.4.0-SNAPSHOT' # TODO: get this from pom.xml
+          comet-version: '0.5.0-SNAPSHOT' # TODO: get this from pom.xml
       - name: Run Spark tests
         run: |
           cd apache-spark

diff --git a/.github/workflows/spark_sql_test_ansi.yml b/.github/workflows/spark_sql_test_ansi.yml
@@ -69,7 +69,7 @@ jobs:
         with:
           spark-version: ${{ matrix.spark-version.full }}
           spark-short-version: ${{ matrix.spark-version.short }}
-          comet-version: '0.4.0-SNAPSHOT' # TODO: get this from pom.xml
+          comet-version: '0.5.0-SNAPSHOT' # TODO: get this from pom.xml
       - name: Run Spark tests
         run: |
           cd apache-spark

diff --git a/README.md b/README.md
@@ -46,7 +46,7 @@ The following chart shows the time it takes to run the 22 TPC-H queries against
 using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
 for details of the environment used for these benchmarks.
 
-When using Comet, the overall run time is reduced from 616 seconds to 374 seconds, a 1.6x speedup, with query 1
+When using Comet, the overall run time is reduced from 615 seconds to 364 seconds, a 1.7x speedup, with query 1
 running 9x faster than Spark.
 
 Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.6x 
@@ -55,21 +55,21 @@ speedup compared to Spark.
 Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup 
 for a broader set of queries.
 
-![](docs/source/_static/images/benchmark-results/0.3.0/tpch_allqueries.png)
+![](docs/source/_static/images/benchmark-results/0.4.0/tpch_allqueries.png)
 
 Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.
 
-![](docs/source/_static/images/benchmark-results/0.3.0/tpch_queries_compare.png)
+![](docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_compare.png)
 
 The following charts show how much Comet currently accelerates each query from the benchmark.
 
 ### Relative speedup
 
-![](docs/source/_static/images/benchmark-results/0.3.0/tpch_queries_speedup_rel.png)
+![](docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_rel.png)
 
 ### Absolute speedup
 
-![](docs/source/_static/images/benchmark-results/0.3.0/tpch_queries_speedup_abs.png)
+![](docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_abs.png)
 
 These benchmarks can be reproduced in any environment using the documentation in the 
 [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage 

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -62,7 +62,7 @@ docker push localhost:32000/apache/datafusion-comet-tpcbench:latest
 export SPARK_MASTER=k8s://https://127.0.0.1:16443
 export COMET_DOCKER_IMAGE=localhost:32000/apache/datafusion-comet-tpcbench:latest
 # Location of Comet JAR within the Docker image
-export COMET_JAR=/opt/spark/jars/comet-spark-spark3.4_2.12-0.2.0-SNAPSHOT.jar
+export COMET_JAR=/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0-SNAPSHOT.jar
 
 $SPARK_HOME/bin/spark-submit \
     --master $SPARK_MASTER \

diff --git a/common/pom.xml b/common/pom.xml
@@ -26,7 +26,7 @@ under the License.
   <parent>
     <groupId>org.apache.datafusion</groupId>
     <artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
-    <version>0.4.0-SNAPSHOT</version>
+    <version>0.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
@@ -322,8 +322,10 @@ object CometConf extends ShimCometConf {
 
   val COMET_COLUMNAR_SHUFFLE_MEMORY_SIZE: OptionalConfigEntry[Long] =
     conf("spark.comet.columnar.shuffle.memorySize")
+      .internal()
       .doc(
-        "The optional maximum size of the memory used for Comet columnar shuffle, in MiB. " +
+        "Test-only config. This is only used to test Comet shuffle with Spark tests. " +
+          "The optional maximum size of the memory used for Comet columnar shuffle, in MiB. " +
           "Note that this config is only used when `spark.comet.exec.shuffle.mode` is " +
           "`jvm`. Once allocated memory size reaches this config, the current batch will be " +
           "flushed to disk immediately. If this is not configured, Comet will use " +
@@ -335,8 +337,10 @@ object CometConf extends ShimCometConf {
 
   val COMET_COLUMNAR_SHUFFLE_MEMORY_FACTOR: ConfigEntry[Double] =
     conf("spark.comet.columnar.shuffle.memory.factor")
+      .internal()
       .doc(
-        "Fraction of Comet memory to be allocated per executor process for Comet shuffle. " +
+        "Test-only config. This is only used to test Comet shuffle with Spark tests. " +
+          "Fraction of Comet memory to be allocated per executor process for Comet shuffle. " +
           "Comet memory size is specified by `spark.comet.memoryOverhead` or " +
           "calculated by `spark.comet.memory.overhead.factor` * `spark.executor.memory`.")
       .doubleConf
@@ -345,6 +349,17 @@ object CometConf extends ShimCometConf {
         "Ensure that Comet shuffle memory overhead factor is a double greater than 0")
       .createWithDefault(1.0)
 
+  val COMET_COLUMNAR_SHUFFLE_UNIFIED_MEMORY_ALLOCATOR_IN_TEST: ConfigEntry[Boolean] =
+    conf("spark.comet.columnar.shuffle.unifiedMemoryAllocatorTest")
+      .doc("Whether to use Spark unified memory allocator for Comet columnar shuffle in tests. " +
+        "If not configured, Comet will use a test-only memory allocator for Comet columnar " +
+        "shuffle when Spark test env detected. The test-only allocator is proposed to run with " +
+        "Spark tests as these tests require on-heap memory configuration. " +
+        "By default, this config is false.")
+      .internal()
+      .booleanConf
+      .createWithDefault(false)
+
   val COMET_COLUMNAR_SHUFFLE_BATCH_SIZE: ConfigEntry[Int] =
     conf("spark.comet.columnar.shuffle.batch.size")
       .internal()

diff --git a/dev/changelog/0.4.0.md b/dev/changelog/0.4.0.md
@@ -0,0 +1,108 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# DataFusion Comet 0.4.0 Changelog
+
+This release consists of 51 commits from 10 contributors. See credits at the end of this changelog for more information.
+
+**Fixed bugs:**
+
+- fix: Use the number of rows from underlying arrays instead of logical row count from RecordBatch [#972](https://github.com/apache/datafusion-comet/pull/972) (viirya)
+- fix: The spilled_bytes metric of CometSortExec should be size instead of time [#984](https://github.com/apache/datafusion-comet/pull/984) (Kontinuation)
+- fix: Properly handle Java exceptions without error messages; fix loading of comet native library from java.library.path [#982](https://github.com/apache/datafusion-comet/pull/982) (Kontinuation)
+- fix: Fallback to Spark if scan has meta columns [#997](https://github.com/apache/datafusion-comet/pull/997) (viirya)
+- fix: Fallback to Spark if named_struct contains duplicate field names [#1016](https://github.com/apache/datafusion-comet/pull/1016) (viirya)
+- fix: Make comet-git-info.properties optional [#1027](https://github.com/apache/datafusion-comet/pull/1027) (andygrove)
+- fix: TopK operator should return correct results on dictionary column with nulls [#1033](https://github.com/apache/datafusion-comet/pull/1033) (viirya)
+- fix: need default value for getSizeAsMb(EXECUTOR_MEMORY.key) [#1046](https://github.com/apache/datafusion-comet/pull/1046) (neyama)
+
+**Performance related:**
+
+- perf: Remove one redundant CopyExec for SMJ [#962](https://github.com/apache/datafusion-comet/pull/962) (andygrove)
+- perf: Add experimental feature to replace SortMergeJoin with ShuffledHashJoin [#1007](https://github.com/apache/datafusion-comet/pull/1007) (andygrove)
+- perf: Cache jstrings during metrics collection [#1029](https://github.com/apache/datafusion-comet/pull/1029) (mbutrovich)
+
+**Implemented enhancements:**
+
+- feat: Support `GetArrayStructFields` expression [#993](https://github.com/apache/datafusion-comet/pull/993) (Kimahriman)
+- feat: Implement bloom_filter_agg [#987](https://github.com/apache/datafusion-comet/pull/987) (mbutrovich)
+- feat: Support more types with BloomFilterAgg [#1039](https://github.com/apache/datafusion-comet/pull/1039) (mbutrovich)
+- feat: Implement CAST from struct to string [#1066](https://github.com/apache/datafusion-comet/pull/1066) (andygrove)
+- feat: Use official DataFusion 43 release [#1070](https://github.com/apache/datafusion-comet/pull/1070) (andygrove)
+- feat: Implement CAST between struct types [#1074](https://github.com/apache/datafusion-comet/pull/1074) (andygrove)
+- feat: support array_append [#1072](https://github.com/apache/datafusion-comet/pull/1072) (NoeB)
+- feat: Require offHeap memory to be enabled (always use unified memory) [#1062](https://github.com/apache/datafusion-comet/pull/1062) (andygrove)
+
+**Documentation updates:**
+
+- doc: add documentation interlinks [#975](https://github.com/apache/datafusion-comet/pull/975) (comphead)
+- docs: Add IntelliJ documentation for generated source code [#985](https://github.com/apache/datafusion-comet/pull/985) (mbutrovich)
+- docs: Update tuning guide [#995](https://github.com/apache/datafusion-comet/pull/995) (andygrove)
+- docs: Various documentation improvements [#1005](https://github.com/apache/datafusion-comet/pull/1005) (andygrove)
+- docs: clarify that Maven central only has jars for Linux [#1009](https://github.com/apache/datafusion-comet/pull/1009) (andygrove)
+- doc: fix K8s links and doc [#1058](https://github.com/apache/datafusion-comet/pull/1058) (comphead)
+- docs: Update benchmarking.md [#1085](https://github.com/apache/datafusion-comet/pull/1085) (rluvaton-flarion)
+
+**Other:**
+
+- chore: Generate changelog for 0.3.0 release [#964](https://github.com/apache/datafusion-comet/pull/964) (andygrove)
+- chore: fix publish-to-maven script [#966](https://github.com/apache/datafusion-comet/pull/966) (andygrove)
+- chore: Update benchmarks results based on 0.3.0-rc1 [#969](https://github.com/apache/datafusion-comet/pull/969) (andygrove)
+- chore: update rem expression guide [#976](https://github.com/apache/datafusion-comet/pull/976) (kazuyukitanimura)
+- chore: Enable additional CreateArray tests [#928](https://github.com/apache/datafusion-comet/pull/928) (Kimahriman)
+- chore: fix compatibility guide [#978](https://github.com/apache/datafusion-comet/pull/978) (kazuyukitanimura)
+- chore: Update for 0.3.0 release, prepare for 0.4.0 development [#970](https://github.com/apache/datafusion-comet/pull/970) (andygrove)
+- chore: Don't transform the HashAggregate to CometHashAggregate if Comet shuffle is disabled [#991](https://github.com/apache/datafusion-comet/pull/991) (viirya)
+- chore: Make parquet reader options Comet options instead of Hadoop options [#968](https://github.com/apache/datafusion-comet/pull/968) (parthchandra)
+- chore: remove legacy comet-spark-shell [#1013](https://github.com/apache/datafusion-comet/pull/1013) (andygrove)
+- chore: Reserve memory for native shuffle writer per partition [#988](https://github.com/apache/datafusion-comet/pull/988) (viirya)
+- chore: Bump arrow-rs to 53.1.0 and datafusion [#1001](https://github.com/apache/datafusion-comet/pull/1001) (kazuyukitanimura)
+- chore: Revert "chore: Reserve memory for native shuffle writer per partition (#988)" [#1020](https://github.com/apache/datafusion-comet/pull/1020) (viirya)
+- minor: Remove hard-coded version number from Dockerfile [#1025](https://github.com/apache/datafusion-comet/pull/1025) (andygrove)
+- chore: Reserve memory for native shuffle writer per partition [#1022](https://github.com/apache/datafusion-comet/pull/1022) (viirya)
+- chore: Improve error handling when native lib fails to load [#1000](https://github.com/apache/datafusion-comet/pull/1000) (andygrove)
+- chore: Use twox-hash 2.0 xxhash64 oneshot api instead of custom implementation [#1041](https://github.com/apache/datafusion-comet/pull/1041) (NoeB)
+- chore: Refactor Arrow Array and Schema allocation in ColumnReader and MetadataColumnReader [#1047](https://github.com/apache/datafusion-comet/pull/1047) (viirya)
+- minor: Refactor binary expr serde to reduce code duplication [#1053](https://github.com/apache/datafusion-comet/pull/1053) (andygrove)
+- chore: Upgrade to DataFusion 43.0.0-rc1 [#1057](https://github.com/apache/datafusion-comet/pull/1057) (andygrove)
+- chore: Refactor UnaryExpr and MathExpr in protobuf [#1056](https://github.com/apache/datafusion-comet/pull/1056) (andygrove)
+- minor: use defaults instead of hard-coding values [#1060](https://github.com/apache/datafusion-comet/pull/1060) (andygrove)
+- minor: refactor UnaryExpr handling to make code more concise [#1065](https://github.com/apache/datafusion-comet/pull/1065) (andygrove)
+- chore: Refactor binary and math expression serde code [#1069](https://github.com/apache/datafusion-comet/pull/1069) (andygrove)
+- chore: Simplify CometShuffleMemoryAllocator to use Spark unified memory allocator [#1063](https://github.com/apache/datafusion-comet/pull/1063) (viirya)
+- test: Restore one test in CometExecSuite by adding COMET_SHUFFLE_MODE config [#1087](https://github.com/apache/datafusion-comet/pull/1087) (viirya)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    19	Andy Grove
+    13	Matt Butrovich
+     8	Liang-Chi Hsieh
+     3	KAZUYUKI TANIMURA
+     2	Adam Binford
+     2	Kristin Cowalcijk
+     1	NoeB
+     1	Oleks V
+     1	Parth Chandra
+     1	neyama
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/dev/diffs/3.4.3.diff b/dev/diffs/3.4.3.diff
@@ -7,7 +7,7 @@ index d3544881af1..bf0e2b53c70 100644
      <ivy.version>2.5.1</ivy.version>
      <oro.version>2.0.8</oro.version>
 +    <spark.version.short>3.4</spark.version.short>
-+    <comet.version>0.4.0-SNAPSHOT</comet.version>
++    <comet.version>0.5.0-SNAPSHOT</comet.version>
      <!--
      If you changes codahale.metrics.version, you also need to change
      the link to metrics.dropwizard.io in docs/monitoring.md.

diff --git a/dev/diffs/3.5.1.diff b/dev/diffs/3.5.1.diff
@@ -7,7 +7,7 @@ index 0f504dbee85..f6019da888a 100644
      <ivy.version>2.5.1</ivy.version>
      <oro.version>2.0.8</oro.version>
 +    <spark.version.short>3.5</spark.version.short>
-+    <comet.version>0.4.0-SNAPSHOT</comet.version>
++    <comet.version>0.5.0-SNAPSHOT</comet.version>
      <!--
      If you changes codahale.metrics.version, you also need to change
      the link to metrics.dropwizard.io in docs/monitoring.md.

diff --git a/dev/diffs/4.0.0-preview1.diff b/dev/diffs/4.0.0-preview1.diff
@@ -7,7 +7,7 @@ index a4b1b2c3c9f..db50bdb0d3b 100644
      <ivy.version>2.5.2</ivy.version>
      <oro.version>2.0.8</oro.version>
 +    <spark.version.short>4.0</spark.version.short>
-+    <comet.version>0.4.0-SNAPSHOT</comet.version>
++    <comet.version>0.5.0-SNAPSHOT</comet.version>
      <!--
      If you change codahale.metrics.version, you also need to change
      the link to metrics.dropwizard.io in docs/monitoring.md.

diff --git a/dev/release/README.md b/dev/release/README.md
@@ -172,6 +172,8 @@ Set up your development environment from here: https://infra.apache.org/publishi
 The script `publish-to-maven.sh` will publish the artifacts created by the `build-release-comet.sh` script.
 The artifacts will be signed using the gpg key of the release manager and uploaded to the maven staging repository.
 
+Note that installed GPG keys can be listed with `gpg --list-keys`. Each GPG key is identified by a 40-character hex string (its fingerprint).
+
 Note: This script needs `xmllint` to be installed. On MacOS xmllint is available by default.
 
 On Ubuntu `apt-get install -y libxml2-utils`

diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpcds_allqueries.png b/docs/source/_static/images/benchmark-results/0.4.0/tpcds_allqueries.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpcds_queries_compare.png b/docs/source/_static/images/benchmark-results/0.4.0/tpcds_queries_compare.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpcds_queries_speedup_abs.png b/docs/source/_static/images/benchmark-results/0.4.0/tpcds_queries_speedup_abs.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpcds_queries_speedup_rel.png b/docs/source/_static/images/benchmark-results/0.4.0/tpcds_queries_speedup_rel.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpch_allqueries.png b/docs/source/_static/images/benchmark-results/0.4.0/tpch_allqueries.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_compare.png b/docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_compare.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_abs.png b/docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_abs.png
diff --git a/docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_rel.png b/docs/source/_static/images/benchmark-results/0.4.0/tpch_queries_speedup_rel.png
diff --git a/docs/source/contributor-guide/benchmark-results/0.3.0/datafusion-python-tpch.json b/docs/source/contributor-guide/benchmark-results/0.3.0/datafusion-python-tpch.json