Commit

Merge remote-tracking branch 'upstream/main' into use_official_releases
viirya committed Jun 6, 2024
2 parents 3c7452f + 6143e7a commit 2c03b8a
Showing 395 changed files with 67,690 additions and 2,486 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
   rust-version:
     description: 'version of rust to install (e.g. nightly)'
     required: true
-    default: 'nightly'
+    default: 'stable'
   jdk-version:
     description: 'jdk version to install (e.g., 17)'
     required: true
2 changes: 1 addition & 1 deletion .github/actions/setup-macos-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
   rust-version:
     description: 'version of rust to install (e.g. nightly)'
     required: true
-    default: 'nightly'
+    default: 'stable'
   jdk-version:
     description: 'jdk version to install (e.g., 17)'
     required: true
4 changes: 2 additions & 2 deletions .github/actions/setup-spark-builder/action.yaml
@@ -23,9 +23,9 @@ inputs:
     required: true
     default: '3.4'
   spark-version:
-    description: 'The Apache Spark version (e.g., 3.4.2) to build'
+    description: 'The Apache Spark version (e.g., 3.4.3) to build'
     required: true
-    default: '3.4.2'
+    default: '3.4.3'
   comet-version:
     description: 'The Comet version to use for Spark'
     required: true
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tpch.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   prepare:
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   prepare:
82 changes: 81 additions & 1 deletion .github/workflows/pr_build.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   linux-test:
@@ -76,6 +76,33 @@ jobs:
           # upload test reports only for java 17
           upload-test-reports: ${{ matrix.java_version == '17' }}

+  linux-test-with-spark4_0:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        java_version: [17]
+        test-target: [java]
+        spark-version: ['4.0']
+        is_push_event:
+          - ${{ github.event_name == 'push' }}
+      fail-fast: false
+    name: ${{ matrix.os }}/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}/${{ matrix.test-target }}
+    runs-on: ${{ matrix.os }}
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java_version }}
+      - name: Java test steps
+        uses: ./.github/actions/java-test
+        with:
+          maven_opts: -Pspark-${{ matrix.spark-version }}
+          upload-test-reports: true
+
   linux-test-with-old-spark:
     strategy:
       matrix:
@@ -169,6 +196,59 @@ jobs:
         with:
           maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}

+  macos-test-with-spark4_0:
+    strategy:
+      matrix:
+        os: [macos-13]
+        java_version: [17]
+        test-target: [java]
+        spark-version: ['4.0']
+      fail-fast: false
+    if: github.event_name == 'push'
+    name: ${{ matrix.os }}/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}/${{ matrix.test-target }}
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-macos-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java_version }}
+      - name: Java test steps
+        uses: ./.github/actions/java-test
+        with:
+          maven_opts: -Pspark-${{ matrix.spark-version }}
+          upload-test-reports: true
+
+  macos-aarch64-test-with-spark4_0:
+    strategy:
+      matrix:
+        java_version: [17]
+        test-target: [java]
+        spark-version: ['4.0']
+        is_push_event:
+          - ${{ github.event_name == 'push' }}
+        exclude: # exclude java 11 for pull_request event
+          - java_version: 11
+            is_push_event: false
+      fail-fast: false
+    name: macos-14(Silicon)/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}/${{ matrix.test-target }}
+    runs-on: macos-14
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-macos-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java_version }}
+          jdk-architecture: aarch64
+          protoc-architecture: aarch_64
+      - name: Java test steps
+        uses: ./.github/actions/java-test
+        with:
+          maven_opts: -Pspark-${{ matrix.spark-version }}
+          upload-test-reports: true
+
   macos-aarch64-test-with-old-spark:
     strategy:
       matrix:
4 changes: 2 additions & 2 deletions .github/workflows/spark_sql_test.yml
@@ -37,15 +37,15 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   spark-sql-catalyst:
     strategy:
       matrix:
         os: [ubuntu-latest]
         java-version: [11]
-        spark-version: [{short: '3.4', full: '3.4.2'}]
+        spark-version: [{short: '3.4', full: '3.4.3'}]
         module:
           - {name: "catalyst", args1: "catalyst/test", args2: ""}
           - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
2 changes: 1 addition & 1 deletion .github/workflows/spark_sql_test_ansi.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   spark-sql-catalyst:
19 changes: 11 additions & 8 deletions Makefile
@@ -44,10 +44,10 @@ format:

 core-amd64:
 	rustup target add x86_64-apple-darwin
-	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
 	mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
 	cp core/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
-	cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release
 	mkdir -p common/target/classes/org/apache/comet/linux/amd64
 	cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
 	jar -cf common/target/comet-native-x86_64.jar \
@@ -57,10 +57,10 @@

 core-arm64:
 	rustup target add aarch64-apple-darwin
-	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
 	mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
 	cp core/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
-	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
 	mkdir -p common/target/classes/org/apache/comet/linux/aarch64
 	cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
 	jar -cf common/target/comet-native-aarch64.jar \
@@ -70,13 +70,16 @@

 release-linux: clean
 	rustup target add aarch64-apple-darwin x86_64-apple-darwin
-	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
-	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
-	cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
 	./mvnw install -Prelease -DskipTests $(PROFILES)
 release:
-	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
 	./mvnw install -Prelease -DskipTests $(PROFILES)
+release-nogit:
+	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+	./mvnw install -Prelease -DskipTests $(PROFILES) -Dmaven.gitcommitid.skip=true
 benchmark-%: clean release
 	cd spark && COMET_CONF_DIR=$(shell pwd)/conf MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="$*" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="$(filter-out $@,$(MAKECMDGOALS))" $(PROFILES)
 .DEFAULT:
98 changes: 63 additions & 35 deletions README.md
@@ -19,58 +19,86 @@ under the License.

 # Apache DataFusion Comet

-Apache DataFusion Comet is an Apache Spark plugin that uses [Apache DataFusion](https://datafusion.apache.org/)
-as native runtime to achieve improvement in terms of query efficiency and query runtime.
+Apache DataFusion Comet is a high-performance accelerator for Apache Spark, built on top of the powerful
+[Apache DataFusion](https://datafusion.apache.org) query engine. Comet is designed to significantly enhance the
+performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the
+Spark ecosystem without requiring any code changes.

-Comet runs Spark SQL queries using the native DataFusion runtime, which is
-typically faster and more resource efficient than JVM based runtimes.
+# Benefits of Using Comet

-<a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-system-diagram.png" align="center" width="500" ></a>
+## Run Spark Queries at DataFusion Speeds

-Comet aims to support:
+Comet delivers a performance speedup for many queries, enabling faster data processing and shorter time-to-insights.

-- a native Parquet implementation, including both reader and writer
-- full implementation of Spark operators, including
-  Filter/Project/Aggregation/Join/Exchange etc.
-- full implementation of Spark built-in expressions
-- a UDF framework for users to migrate their existing UDF to native
+The following chart shows the time it takes to run the 22 TPC-H queries against 100 GB of data in Parquet format
+using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
+for details of the environment used for these benchmarks.

-## Architecture
+When using Comet, the overall run time is reduced from 649 seconds to 440 seconds, a 1.5x speedup.

-The following diagram illustrates the architecture of Comet:
+Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.9x
+speedup compared to Spark.

-<a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-overview.png" align="center" height="600" width="750" ></a>
+Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
+for many use cases.

-## Current Status
+![](docs/source/_static/images/tpch_allqueries.png)

-The project is currently integrated into Apache Spark 3.2, 3.3, and 3.4.
+Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.

-## Feature Parity with Apache Spark
+![](docs/source/_static/images/tpch_queries_compare.png)

-The project strives to keep feature parity with Apache Spark, that is,
-users should expect the same behavior (w.r.t features, configurations,
-query results, etc) with Comet turned on or turned off in their Spark
-jobs. In addition, Comet extension should automatically detect unsupported
-features and fallback to Spark engine.
+The following chart shows how much Comet currently accelerates each query from the benchmark. Performance optimization
+is an ongoing task, and we welcome contributions from the community to help achieve even greater speedups in the future.

-To achieve this, besides unit tests within Comet itself, we also re-use
-Spark SQL tests and make sure they all pass with Comet extension
-enabled.
+![](docs/source/_static/images/tpch_queries_speedup.png)

-## Supported Platforms
+These benchmarks can be reproduced in any environment using the documentation in the
+[Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage
+you to run your own benchmarks.

-Linux, Apple OSX (Intel and M1)
+## Use Commodity Hardware

-## Requirements
+Comet leverages commodity hardware, eliminating the need for costly hardware upgrades or
+specialized hardware accelerators, such as GPUs or FPGAs. By maximizing the utilization of commodity hardware, Comet
+ensures cost-effectiveness and scalability for your Spark deployments.

-- Apache Spark 3.2, 3.3, or 3.4
-- JDK 8, 11 and 17 (JDK 11 recommended because Spark 3.2 doesn't support 17)
-- GLIBC 2.17 (Centos 7) and up
+## Spark Compatibility

-## Getting started
+Comet aims for 100% compatibility with all supported versions of Apache Spark, allowing you to integrate Comet into
+your existing Spark deployments and workflows seamlessly. With no code changes required, you can immediately harness
+the benefits of Comet's acceleration capabilities without disrupting your Spark applications.

-See the [DataFusion Comet User Guide](https://datafusion.apache.org/comet/user-guide/installation.html) for installation instructions.
+## Tight Integration with Apache DataFusion

+Comet tightly integrates with the core Apache DataFusion project, leveraging its powerful execution engine. With
+seamless interoperability between Comet and DataFusion, you can achieve optimal performance and efficiency in your
+Spark workloads.
+
+## Active Community
+
+Comet boasts a vibrant and active community of developers, contributors, and users dedicated to advancing the
+capabilities of Apache DataFusion and accelerating the performance of Apache Spark.
+
+## Getting Started
+
+To get started with Apache DataFusion Comet, follow the
+[installation instructions](https://datafusion.apache.org/comet/user-guide/installation.html). Join the
+[DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect
+with other users, ask questions, and share your experiences with Comet.

 ## Contributing
-See the [DataFusion Comet Contribution Guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html)
-for information on how to get started contributing to the project.
+We welcome contributions from the community to help improve and enhance Apache DataFusion Comet. Whether it's fixing
+bugs, adding new features, writing documentation, or optimizing performance, your contributions are invaluable in
+shaping the future of Comet. Check out our
+[contributor guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html) to get started.

 ## License

 Apache DataFusion Comet is licensed under the Apache License 2.0. See the [LICENSE.txt](LICENSE.txt) file for details.
+
+## Acknowledgments
+
+We would like to express our gratitude to the Apache DataFusion community for their support and contributions to
+Comet. Together, we're building a faster, more efficient future for big data processing with Apache Spark.
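[Editor's note] The new README's "no code changes" claim means that enabling Comet is purely a configuration matter. The sketch below is a minimal illustration of that idea and is not part of this commit; the plugin class name, the spark.comet.* config keys, and the application code are assumptions drawn from the Comet user guide, so consult the installation instructions for the exact values and for how to put the Comet jar on the classpath.

// Hypothetical sketch: enabling Comet for an existing Spark job.
// The plugin class and config keys below are assumptions based on the
// Comet user guide, not taken from this commit.
import org.apache.spark.sql.SparkSession;

public class CometQuickStart {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("comet-quickstart")
            // Load the Comet plugin; existing query code stays unchanged.
            .config("spark.plugins", "org.apache.spark.CometPlugin")
            .config("spark.comet.enabled", "true")
            .config("spark.comet.exec.enabled", "true")
            .getOrCreate();

    // Supported operators run natively via DataFusion; anything
    // unsupported falls back to regular Spark execution.
    spark.read().parquet("/path/to/data.parquet").createOrReplaceTempView("t");
    spark.sql("SELECT COUNT(*) FROM t").show();

    spark.stop();
  }
}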
@@ -234,7 +234,11 @@ public CometDecodedVector loadVector() {
       Dictionary arrowDictionary = importer.getProvider().lookup(dictionaryEncoding.getId());
       CometPlainVector dictionaryVector =
           new CometPlainVector(arrowDictionary.getVector(), useDecimal128, isUuid);
-      dictionary = new CometDictionary(dictionaryVector);
+      if (dictionary != null) {
+        dictionary.setDictionaryVector(dictionaryVector);
+      } else {
+        dictionary = new CometDictionary(dictionaryVector);
+      }

       currentVector =
           new CometDictionaryVector(
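[Editor's note] The hunk above reuses an existing CometDictionary rather than constructing a fresh one on every load, so consumers holding a reference to the dictionary see the updated backing vector. A minimal sketch of the setter this relies on might look like the following; the field and method shapes are illustrative assumptions, and the real CometDictionary in this repository may perform additional validation or reset cached decoded values.

// Minimal sketch of the dictionary-reuse pattern assumed by the diff above.
// Illustrative only; the actual CometDictionary implementation may differ.
public class CometDictionary {
  private CometPlainVector dictionaryVector;

  public CometDictionary(CometPlainVector dictionaryVector) {
    this.dictionaryVector = dictionaryVector;
  }

  /** Replaces the backing vector in place so existing references stay valid. */
  public void setDictionaryVector(CometPlainVector dictionaryVector) {
    this.dictionaryVector = dictionaryVector;
  }
}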
(Diff for the remaining changed files is not shown.)