Commit

Merge remote-tracking branch 'upstream/main' into use_official_releases
viirya committed Jun 6, 2024
2 parents 3c7452f + 6143e7a commit 2c03b8a
Showing 395 changed files with 67,690 additions and 2,486 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
   rust-version:
     description: 'version of rust to install (e.g. nightly)'
     required: true
-    default: 'nightly'
+    default: 'stable'
   jdk-version:
     description: 'jdk version to install (e.g., 17)'
     required: true
2 changes: 1 addition & 1 deletion .github/actions/setup-macos-builder/action.yaml
@@ -21,7 +21,7 @@ inputs:
   rust-version:
     description: 'version of rust to install (e.g. nightly)'
     required: true
-    default: 'nightly'
+    default: 'stable'
   jdk-version:
     description: 'jdk version to install (e.g., 17)'
     required: true
4 changes: 2 additions & 2 deletions .github/actions/setup-spark-builder/action.yaml
@@ -23,9 +23,9 @@ inputs:
     required: true
     default: '3.4'
   spark-version:
-    description: 'The Apache Spark version (e.g., 3.4.2) to build'
+    description: 'The Apache Spark version (e.g., 3.4.3) to build'
     required: true
-    default: '3.4.2'
+    default: '3.4.3'
   comet-version:
     description: 'The Comet version to use for Spark'
     required: true
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tpch.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   prepare:
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   prepare:
82 changes: 81 additions & 1 deletion .github/workflows/pr_build.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   linux-test:
@@ -76,6 +76,33 @@ jobs:
           # upload test reports only for java 17
           upload-test-reports: ${{ matrix.java_version == '17' }}

+  linux-test-with-spark4_0:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        java_version: [17]
+        test-target: [java]
+        spark-version: ['4.0']
+        is_push_event:
+          - ${{ github.event_name == 'push' }}
+      fail-fast: false
+    name: ${{ matrix.os }}/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}/${{ matrix.test-target }}
+    runs-on: ${{ matrix.os }}
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java_version }}
+      - name: Java test steps
+        uses: ./.github/actions/java-test
+        with:
+          maven_opts: -Pspark-${{ matrix.spark-version }}
+          upload-test-reports: true
+
   linux-test-with-old-spark:
     strategy:
       matrix:
@@ -169,6 +196,59 @@ jobs:
         with:
           maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}

+  macos-test-with-spark4_0:
+    strategy:
+      matrix:
+        os: [macos-13]
+        java_version: [17]
+        test-target: [java]
+        spark-version: ['4.0']
+      fail-fast: false
+    if: github.event_name == 'push'
+    name: ${{ matrix.os }}/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}/${{ matrix.test-target }}
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-macos-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java_version }}
+      - name: Java test steps
+        uses: ./.github/actions/java-test
+        with:
+          maven_opts: -Pspark-${{ matrix.spark-version }}
+          upload-test-reports: true
+
+  macos-aarch64-test-with-spark4_0:
+    strategy:
+      matrix:
+        java_version: [17]
+        test-target: [java]
+        spark-version: ['4.0']
+        is_push_event:
+          - ${{ github.event_name == 'push' }}
+        exclude: # exclude java 11 for pull_request event
+          - java_version: 11
+            is_push_event: false
+      fail-fast: false
+    name: macos-14(Silicon)/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}/${{ matrix.test-target }}
+    runs-on: macos-14
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-macos-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: ${{ matrix.java_version }}
+          jdk-architecture: aarch64
+          protoc-architecture: aarch_64
+      - name: Java test steps
+        uses: ./.github/actions/java-test
+        with:
+          maven_opts: -Pspark-${{ matrix.spark-version }}
+          upload-test-reports: true
+
   macos-aarch64-test-with-old-spark:
     strategy:
       matrix:
4 changes: 2 additions & 2 deletions .github/workflows/spark_sql_test.yml
@@ -37,15 +37,15 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   spark-sql-catalyst:
     strategy:
       matrix:
         os: [ubuntu-latest]
         java-version: [11]
-        spark-version: [{short: '3.4', full: '3.4.2'}]
+        spark-version: [{short: '3.4', full: '3.4.3'}]
         module:
           - {name: "catalyst", args1: "catalyst/test", args2: ""}
           - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest}
2 changes: 1 addition & 1 deletion .github/workflows/spark_sql_test_ansi.yml
@@ -37,7 +37,7 @@ on:
   workflow_dispatch:

 env:
-  RUST_VERSION: nightly
+  RUST_VERSION: stable

 jobs:
   spark-sql-catalyst:
19 changes: 11 additions & 8 deletions Makefile
@@ -44,10 +44,10 @@ format:

 core-amd64:
 	rustup target add x86_64-apple-darwin
-	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
 	mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
 	cp core/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
-	cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release
 	mkdir -p common/target/classes/org/apache/comet/linux/amd64
 	cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
 	jar -cf common/target/comet-native-x86_64.jar \
@@ -57,10 +57,10 @@

 core-arm64:
 	rustup target add aarch64-apple-darwin
-	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
 	mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
 	cp core/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
-	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
 	mkdir -p common/target/classes/org/apache/comet/linux/aarch64
 	cp core/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
 	jar -cf common/target/comet-native-aarch64.jar \
@@ -70,13 +70,16 @@

 release-linux: clean
 	rustup target add aarch64-apple-darwin x86_64-apple-darwin
-	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --features nightly --release
-	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --features nightly --release
-	cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release
 	./mvnw install -Prelease -DskipTests $(PROFILES)
 release:
-	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
 	./mvnw install -Prelease -DskipTests $(PROFILES)
+release-nogit:
+	cd core && RUSTFLAGS="-Ctarget-cpu=native" cargo build --features nightly --release
+	./mvnw install -Prelease -DskipTests $(PROFILES) -Dmaven.gitcommitid.skip=true
 benchmark-%: clean release
 	cd spark && COMET_CONF_DIR=$(shell pwd)/conf MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="$*" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="$(filter-out $@,$(MAKECMDGOALS))" $(PROFILES)
 .DEFAULT:
98 changes: 63 additions & 35 deletions README.md
@@ -19,58 +19,86 @@ under the License.

 # Apache DataFusion Comet

-Apache DataFusion Comet is an Apache Spark plugin that uses [Apache DataFusion](https://datafusion.apache.org/)
-as native runtime to achieve improvement in terms of query efficiency and query runtime.
+Apache DataFusion Comet is a high-performance accelerator for Apache Spark, built on top of the powerful
+[Apache DataFusion](https://datafusion.apache.org) query engine. Comet is designed to significantly enhance the
+performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the
+Spark ecosystem without requiring any code changes.

-Comet runs Spark SQL queries using the native DataFusion runtime, which is
-typically faster and more resource efficient than JVM based runtimes.
+# Benefits of Using Comet

-<a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-system-diagram.png" align="center" width="500" ></a>
+## Run Spark Queries at DataFusion Speeds

-Comet aims to support:
+Comet delivers a performance speedup for many queries, enabling faster data processing and shorter time-to-insights.

-- a native Parquet implementation, including both reader and writer
-- full implementation of Spark operators, including
-  Filter/Project/Aggregation/Join/Exchange etc.
-- full implementation of Spark built-in expressions
-- a UDF framework for users to migrate their existing UDF to native
+The following chart shows the time it takes to run the 22 TPC-H queries against 100 GB of data in Parquet format
+using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
+for details of the environment used for these benchmarks.

-## Architecture
+When using Comet, the overall run time is reduced from 649 seconds to 440 seconds, a 1.5x speedup.

-The following diagram illustrates the architecture of Comet:
+Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.9x
+speedup compared to Spark.

-<a href="docs/source/_static/images/comet-overview.png"><img src="docs/source/_static/images/comet-overview.png" align="center" height="600" width="750" ></a>
+Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
+for many use cases.

-## Current Status
+![](docs/source/_static/images/tpch_allqueries.png)

-The project is currently integrated into Apache Spark 3.2, 3.3, and 3.4.
+Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.

-## Feature Parity with Apache Spark
+![](docs/source/_static/images/tpch_queries_compare.png)

-The project strives to keep feature parity with Apache Spark, that is,
-users should expect the same behavior (w.r.t features, configurations,
-query results, etc) with Comet turned on or turned off in their Spark
-jobs. In addition, Comet extension should automatically detect unsupported
-features and fallback to Spark engine.
+The following chart shows how much Comet currently accelerates each query from the benchmark. Performance optimization
+is an ongoing task, and we welcome contributions from the community to help achieve even greater speedups in the future.

-To achieve this, besides unit tests within Comet itself, we also re-use
-Spark SQL tests and make sure they all pass with Comet extension
-enabled.
+![](docs/source/_static/images/tpch_queries_speedup.png)

-## Supported Platforms
+These benchmarks can be reproduced in any environment using the documentation in the
+[Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage
+you to run your own benchmarks.

-Linux, Apple OSX (Intel and M1)
+## Use Commodity Hardware

-## Requirements
+Comet leverages commodity hardware, eliminating the need for costly hardware upgrades or
+specialized hardware accelerators, such as GPUs or FPGAs. By maximizing the utilization of commodity hardware, Comet
+ensures cost-effectiveness and scalability for your Spark deployments.

-- Apache Spark 3.2, 3.3, or 3.4
-- JDK 8, 11 and 17 (JDK 11 recommended because Spark 3.2 doesn't support 17)
-- GLIBC 2.17 (Centos 7) and up
+## Spark Compatibility

-## Getting started
+Comet aims for 100% compatibility with all supported versions of Apache Spark, allowing you to integrate Comet into
+your existing Spark deployments and workflows seamlessly. With no code changes required, you can immediately harness
+the benefits of Comet's acceleration capabilities without disrupting your Spark applications.

-See the [DataFusion Comet User Guide](https://datafusion.apache.org/comet/user-guide/installation.html) for installation instructions.
+## Tight Integration with Apache DataFusion

+Comet tightly integrates with the core Apache DataFusion project, leveraging its powerful execution engine. With
+seamless interoperability between Comet and DataFusion, you can achieve optimal performance and efficiency in your
+Spark workloads.
+
+## Active Community
+
+Comet boasts a vibrant and active community of developers, contributors, and users dedicated to advancing the
+capabilities of Apache DataFusion and accelerating the performance of Apache Spark.
+
+## Getting Started
+
+To get started with Apache DataFusion Comet, follow the
+[installation instructions](https://datafusion.apache.org/comet/user-guide/installation.html). Join the
+[DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect
+with other users, ask questions, and share your experiences with Comet.

 ## Contributing
-See the [DataFusion Comet Contribution Guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html)
-for information on how to get started contributing to the project.
+We welcome contributions from the community to help improve and enhance Apache DataFusion Comet. Whether it's fixing
+bugs, adding new features, writing documentation, or optimizing performance, your contributions are invaluable in
+shaping the future of Comet. Check out our
+[contributor guide](https://datafusion.apache.org/comet/contributor-guide/contributing.html) to get started.

 ## License

 Apache DataFusion Comet is licensed under the Apache License 2.0. See the [LICENSE.txt](LICENSE.txt) file for details.
+
+## Acknowledgments
+
+We would like to express our gratitude to the Apache DataFusion community for their support and contributions to
+Comet. Together, we're building a faster, more efficient future for big data processing with Apache Spark.
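[Editor's note] The new README's "no code changes" claim means that enabling Comet is purely a configuration matter. The sketch below is a minimal illustration of that idea and is not part of this commit; the plugin class name, the spark.comet.* config keys, and the application code are assumptions drawn from the Comet user guide, so consult the installation instructions for the exact values and for how to put the Comet jar on the classpath.

// Hypothetical sketch: enabling Comet for an existing Spark job.
// The plugin class and config keys below are assumptions based on the
// Comet user guide, not taken from this commit.
import org.apache.spark.sql.SparkSession;

public class CometQuickStart {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("comet-quickstart")
            // Load the Comet plugin; existing query code stays unchanged.
            .config("spark.plugins", "org.apache.spark.CometPlugin")
            .config("spark.comet.enabled", "true")
            .config("spark.comet.exec.enabled", "true")
            .getOrCreate();

    // Supported operators run natively via DataFusion; anything
    // unsupported falls back to regular Spark execution.
    spark.read().parquet("/path/to/data.parquet").createOrReplaceTempView("t");
    spark.sql("SELECT COUNT(*) FROM t").show();

    spark.stop();
  }
}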
@@ -234,7 +234,11 @@ public CometDecodedVector loadVector() {
       Dictionary arrowDictionary = importer.getProvider().lookup(dictionaryEncoding.getId());
       CometPlainVector dictionaryVector =
           new CometPlainVector(arrowDictionary.getVector(), useDecimal128, isUuid);
-      dictionary = new CometDictionary(dictionaryVector);
+      if (dictionary != null) {
+        dictionary.setDictionaryVector(dictionaryVector);
+      } else {
+        dictionary = new CometDictionary(dictionaryVector);
+      }

       currentVector =
           new CometDictionaryVector(
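[Editor's note] The hunk above reuses an existing CometDictionary rather than constructing a fresh one on every load, so consumers holding a reference to the dictionary see the updated backing vector. A minimal sketch of the setter this relies on might look like the following; the field and method shapes are illustrative assumptions, and the real CometDictionary in this repository may perform additional validation or reset cached decoded values.

// Minimal sketch of the dictionary-reuse pattern assumed by the diff above.
// Illustrative only; the actual CometDictionary implementation may differ.
public class CometDictionary {
  private CometPlainVector dictionaryVector;

  public CometDictionary(CometPlainVector dictionaryVector) {
    this.dictionaryVector = dictionaryVector;
  }

  /** Replaces the backing vector in place so existing references stay valid. */
  public void setDictionaryVector(CometPlainVector dictionaryVector) {
    this.dictionaryVector = dictionaryVector;
  }
}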
(Diff for the remaining changed files is not shown.)